11#include " Clustering.h"
22#include " ClusteringAlgorithms.h"
3+ #include " AlignmentSymmetry.h"
34#include " Debug.h"
45#include " Util.h"
56#include " itoa.h"
67#include " Timer.h"
78#include " SequenceWeights.h"
9+ #include < fstream>
810
911Clustering::Clustering (const std::string &seqDB, const std::string &seqDBIndex,
1012 const std::string &alnDB, const std::string &alnDBIndex,
1113 const std::string &outDB, const std::string &outDBIndex,
1214 const std::string &sequenceWeightFile,
13- unsigned int maxIteration, int similarityScoreType, int threads, int compressed) : maxIteration(maxIteration),
15+ unsigned int maxIteration, int similarityScoreType, int threads, int compressed, bool needSET) : needSET(needSET),
16+ maxIteration(maxIteration),
1417 similarityScoreType(similarityScoreType),
1518 threads(threads),
1619 compressed(compressed),
1720 outDB(outDB),
1821 outDBIndex(outDBIndex) {
1922
2023 seqDbr = new DBReader<unsigned int >(seqDB.c_str (), seqDBIndex.c_str (), threads, DBReader<unsigned int >::USE_INDEX);
21-
24+ alnDbr = new DBReader<unsigned int >(alnDB.c_str (), alnDBIndex.c_str (), threads, DBReader<unsigned int >::USE_DATA|DBReader<unsigned int >::USE_INDEX);
25+ alnDbr->open (DBReader<unsigned int >::NOSORT);
2226 if (!sequenceWeightFile.empty ()) {
23-
2427 seqDbr->open (DBReader<unsigned int >::SORT_BY_ID);
25-
2628 SequenceWeights *sequenceWeights = new SequenceWeights (sequenceWeightFile.c_str ());
2729 float *localid2weight = new float [seqDbr->getSize ()];
2830 for (size_t id = 0 ; id < seqDbr->getSize (); id++) {
@@ -33,29 +35,153 @@ Clustering::Clustering(const std::string &seqDB, const std::string &seqDBIndex,
3335 delete[] localid2weight;
3436 delete sequenceWeights;
3537
36- } else
37- seqDbr->open (DBReader<unsigned int >::SORT_BY_LENGTH);
38+ } else {
39+ if (needSET == false ) {
40+ seqDbr->open (DBReader<unsigned int >::SORT_BY_LENGTH);
41+ } else {
42+ DBReader<unsigned int > *originalseqDbr = new DBReader<unsigned int >(seqDB.c_str (), seqDBIndex.c_str (), threads, DBReader<unsigned int >::USE_INDEX);
43+ originalseqDbr->open (DBReader<unsigned int >::NOSORT);
44+ DBReader<unsigned int >::Index * seqIndex = originalseqDbr->getIndex ();
45+
46+ unsigned int lastKey = originalseqDbr->getLastKey ();
47+ keyToSet = new unsigned int [lastKey+1 ];
48+ std::vector<bool > keysInSeq (lastKey+1 , false );
49+ std::map<unsigned int , unsigned int > setToLength;
50+
51+ std::ifstream mappingStream (seqDB + " .lookup" );
52+ std::string line;
53+ unsigned int setkey = 0 ;
54+ while (std::getline (mappingStream, line)) {
55+ std::vector<std::string> split = Util::split (line, " \t " );
56+ unsigned int key = strtoul (split[0 ].c_str (), NULL , 10 );
57+ setkey = strtoul (split[2 ].c_str (), NULL , 10 );
58+ keyToSet[key] = setkey;
59+ }
60+ for (size_t id = 0 ; id < originalseqDbr->getSize (); id++) {
61+ setToLength[keyToSet[seqIndex[id].id ]] += seqIndex[id].length ;
62+ keysInSeq[seqIndex[id].id ] = 1 ;
63+ }
64+ unsigned int sourceLen = setkey + 1 ;
65+ seqnum = setToLength.size ();
66+ sourceList = new (std::nothrow) unsigned int [lastKey];
67+ sourceOffsets = new (std::nothrow) size_t [sourceLen + 1 ];
68+ sourceLookupTable = new (std::nothrow) unsigned int *[sourceLen];
69+
70+ mappingStream.close ();
71+ mappingStream.open (seqDB + " .lookup" );
72+ line = " " ;
73+ unsigned int prevsetkey = UINT_MAX;
74+ size_t n = 0 ;
75+ size_t lookupOrder = 0 ;
76+ setkey = UINT_MAX;
77+ while (std::getline (mappingStream, line)) {
78+ std::vector<std::string> split = Util::split (line, " \t " );
79+ unsigned int key = strtoul (split[0 ].c_str (), NULL , 10 );
80+ setkey = strtoul (split[2 ].c_str (), NULL , 10 );
81+ if (setkey != prevsetkey) {
82+ if (prevsetkey != UINT_MAX){
83+ sourceOffsets[prevsetkey] = n;
84+ for (size_t k = prevsetkey+1 ; k<setkey; k++) {
85+ sourceOffsets[k] = 0 ;
86+ }
87+ }
88+ prevsetkey = setkey;
89+ if (keysInSeq[key] == 1 ) {
90+ sourceKeyVec.emplace_back (setkey);
91+ }
92+ n = 0 ;
93+ }
94+ if (keysInSeq[key] == 1 ) {
95+ sourceList[lookupOrder] = key;
96+ } else {
97+ sourceList[lookupOrder] = UINT_MAX;
98+ }
99+ n++;
100+ lookupOrder++;
101+ }
102+ sourceOffsets[prevsetkey] = n;
103+ AlignmentSymmetry::computeOffsetFromCounts (sourceOffsets, sourceLen);
104+ AlignmentSymmetry::setupPointers<unsigned int >(sourceList, sourceLookupTable, sourceOffsets, sourceLen, lastKey);
105+
106+ char * data = (char *)malloc (
107+ sizeof (size_t ) +
108+ sizeof (size_t ) +
109+ sizeof (unsigned int ) +
110+ sizeof (int ) +
111+ sizeof (unsigned int ) +
112+ sizeof (DBReader<unsigned int >::Index) * seqnum
113+ );
114+
115+ std::vector<DBReader<unsigned int >::Index*> indexStorage (seqnum);
116+
117+ n = 0 ;
118+ for (const auto & pairs : setToLength) {
119+ indexStorage[n] = new DBReader<unsigned int >::Index;
120+ indexStorage[n]->id = pairs.first ;
121+ indexStorage[n]->length = pairs.second ;
122+ indexStorage[n]->offset = 0 ;
123+ n++;
124+ }
125+
126+ char * p = data;
127+ *((size_t *)p) = seqnum;
128+ p += sizeof (size_t );
129+ *((size_t *)p) = 0 ;
130+ p += sizeof (size_t );
131+ *((unsigned int *)p) = indexStorage[seqnum-1 ]->id ;
132+ p += sizeof (unsigned int );
133+ *((int *)p) = originalseqDbr->getDbtype ();
134+ p += sizeof (int );
135+ *((unsigned int *)p) = indexStorage[0 ]->length ;
136+ p += sizeof (unsigned int );
137+ for (size_t i = 0 ; i < seqnum; ++i) {
138+ memcpy (
139+ p + i * sizeof (DBReader<unsigned int >::Index),
140+ indexStorage[i],
141+ sizeof (DBReader<unsigned int >::Index)
142+ );
143+ }
144+ p += sizeof (DBReader<unsigned int >::Index) * seqnum;
145+ seqDbr = DBReader<unsigned int >::unserialize (data, threads);
146+ seqDbr->open (DBReader<unsigned int >::SORT_BY_LENGTH);
147+ for (auto * ptr : indexStorage) {
148+ delete ptr;
149+ }
150+ }
151+ }
38152
39- alnDbr = new DBReader<unsigned int >(alnDB.c_str (), alnDBIndex.c_str (), threads, DBReader<unsigned int >::USE_DATA|DBReader<unsigned int >::USE_INDEX);
40- alnDbr->open (DBReader<unsigned int >::NOSORT);
41153
42154}
43155
44156Clustering::~Clustering () {
45157 delete seqDbr;
46158 delete alnDbr;
159+ if (needSET){
160+ delete keyToSet;
161+ delete sourceOffsets;
162+ delete sourceList;
163+ delete[] sourceLookupTable;
164+ }
47165}
48166
49167
50168void Clustering::run (int mode) {
51169 Timer timer;
52- DBWriter *dbw = new DBWriter (outDB.c_str (), outDBIndex.c_str (), 1 , compressed, Parameters::DBTYPE_CLUSTER_RES);
170+
171+ unsigned int dbType = Parameters::DBTYPE_CLUSTER_RES;
172+ unsigned int dbTypeSet = DBReader<unsigned int >::setExtendedDbtype (dbType, Parameters::DBTYPE_EXTENDED_SET);
173+ DBWriter *dbw;
174+ if (needSET) {
175+ dbw = new DBWriter (outDB.c_str (), outDBIndex.c_str (), 1 , compressed, dbTypeSet);
176+ } else {
177+ dbw = new DBWriter (outDB.c_str (), outDBIndex.c_str (), 1 , compressed, dbType);
178+ }
53179 dbw->open ();
54180
55181 std::pair<unsigned int , unsigned int > * ret;
56182 ClusteringAlgorithms *algorithm = new ClusteringAlgorithms (seqDbr, alnDbr,
57183 threads, similarityScoreType,
58- maxIteration);
184+ maxIteration, keyToSet, sourceOffsets, sourceLookupTable, sourceList, seqnum, needSET );
59185
60186 if (mode == Parameters::GREEDY) {
61187 Debug (Debug::INFO) << " Clustering mode: Greedy\n " ;
@@ -79,7 +205,7 @@ void Clustering::run(int mode) {
79205 size_t dbSize = alnDbr->getSize ();
80206 size_t seqDbSize = seqDbr->getSize ();
81207 size_t cluNum = (dbSize > 0 ) ? 1 : 0 ;
82- for (size_t i = 1 ; i < dbSize ; i++){
208+ for (size_t i = 1 ; i < seqDbSize ; i++){
83209 cluNum += (ret[i].first != ret[i-1 ].first );
84210 }
85211 Debug (Debug::INFO) << " Total time: " << timer.lap () << " \n " ;
@@ -88,11 +214,10 @@ void Clustering::run(int mode) {
88214 Debug (Debug::INFO) << " Number of clusters: " << cluNum << " \n\n " ;
89215
90216 Debug (Debug::INFO) << " Writing results " ;
91- writeData (dbw, ret, dbSize );
217+ writeData (dbw, ret, seqDbSize );
92218 Debug (Debug::INFO) << timerWrite.lap () << " \n " ;
93219 delete [] ret;
94220 delete algorithm;
95-
96221 dbw->close (false , false );
97222 seqDbr->close ();
98223 alnDbr->close ();
0 commit comments