Skip to content

Commit c0bff1e

Browse files
Add --index-subset 8 to avoid storing the sequence lookup
1 parent: 2348eb6 · commit: c0bff1e

File tree

3 files changed: 30 additions (+30) and 27 deletions (-27)

3 files changed: 30 additions (+30) and 27 deletions (-27)

src/commons/Parameters.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -198,7 +198,7 @@ Parameters::Parameters():
198198
// indexdb
199199
PARAM_CHECK_COMPATIBLE(PARAM_CHECK_COMPATIBLE_ID, "--check-compatible", "Check compatible", "0: Always recreate index, 1: Check if recreating index is needed, 2: Fail if index is incompatible", typeid(int), (void *) &checkCompatible, "^[0-2]{1}$", MMseqsParameter::COMMAND_MISC),
200200
PARAM_SEARCH_TYPE(PARAM_SEARCH_TYPE_ID, "--search-type", "Search type", "Search type 0: auto 1: amino acid, 2: translated, 3: nucleotide, 4: translated nucleotide alignment", typeid(int), (void *) &searchType, "^[0-4]{1}"),
201-
PARAM_INDEX_SUBSET(PARAM_INDEX_SUBSET_ID, "--index-subset", "Index subset", "Create specialized index with subset of entries\n0: normal index\n1: index without headers\n2: index without prefiltering data\n4: index without aln (for cluster db)\nFlags can be combined bit wise", typeid(int), (void *) &indexSubset, "^[0-7]{1}", MMseqsParameter::COMMAND_EXPERT),
201+
PARAM_INDEX_SUBSET(PARAM_INDEX_SUBSET_ID, "--index-subset", "Index subset", "Create specialized index with subset of entries\n0: normal index\n1: index without headers\n2: index without prefiltering data\n4: index without aln (for cluster db)\n8: no sequence lookup (good for GPU only searches)\nFlags can be combined bit wise", typeid(int), (void *) &indexSubset, "^[0-7]{1}", MMseqsParameter::COMMAND_EXPERT),
202202
PARAM_INDEX_DBSUFFIX(PARAM_INDEX_DBSUFFIX_ID, "--index-dbsuffix", "Index dbsuffix", "A suffix of the db (used for cluster dbs)", typeid(std::string), (void *) &indexDbsuffix, "", MMseqsParameter::COMMAND_HIDDEN),
203203
// createdb
204204
PARAM_USE_HEADER(PARAM_USE_HEADER_ID, "--use-fasta-header", "Use fasta header", "Use the id parsed from the fasta header as the index key instead of using incrementing numeric identifiers", typeid(bool), (void *) &useHeader, ""),

src/commons/Parameters.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -192,6 +192,7 @@ class Parameters {
192192
static const int INDEX_SUBSET_NO_HEADERS = 1;
193193
static const int INDEX_SUBSET_NO_PREFILTER = 2;
194194
static const int INDEX_SUBSET_NO_ALIGNMENT = 4;
195+
static const int INDEX_SUBSET_NO_SEQUENCE_LOOKUP = 8;
195196

196197

197198
static std::vector<int> getOutputFormat(int formatMode, const std::string &outformat, bool &needSequences, bool &needBacktrace, bool &needFullHeaders,

src/prefiltering/PrefilteringIndexReader.cpp

Lines changed: 28 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -58,8 +58,9 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB,
5858
bool compBiasCorrection, int alphabetSize, int kmerSize, int maskMode,
5959
int maskLowerCase, float maskProb, int maskNrepeats, int kmerThr, int targetSearchMode, int splits,
6060
int indexSubset) {
61-
const bool noKmerIndex = (indexSubset & Parameters::INDEX_SUBSET_NO_PREFILTER) != 0;
62-
if (noKmerIndex) {
61+
const bool needKmerIndex = (indexSubset & Parameters::INDEX_SUBSET_NO_PREFILTER) == 0;
62+
const bool needSequenceLookup = (indexSubset & Parameters::INDEX_SUBSET_NO_SEQUENCE_LOOKUP) == 0;
63+
if (needKmerIndex == false) {
6364
splits = 1;
6465
}
6566

@@ -196,7 +197,7 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB,
196197

197198
ScoreMatrix s3;
198199
ScoreMatrix s2;
199-
if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE) == false && noKmerIndex == false) {
200+
if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE) == false && needKmerIndex == true) {
200201
int alphabetSize = subMat->alphabetSize;
201202
subMat->alphabetSize = subMat->alphabetSize-1;
202203
s3 = ExtendedSubstitutionMatrix::calcScoreMatrix(*subMat, 3);
@@ -225,22 +226,23 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB,
225226
}
226227

227228
IndexTable * indexTable;
228-
if(noKmerIndex){
229-
indexTable = NULL;
230-
} else {
229+
if(needKmerIndex){
231230
indexTable = new IndexTable(adjustAlphabetSize, kmerSize, false);
231+
} else {
232+
indexTable = NULL;
232233
}
233234
SequenceLookup *sequenceLookup = NULL;
234-
IndexBuilder::fillDatabase(indexTable, &sequenceLookup,
235-
*subMat, s3, s2, &seq, dbr1, dbFrom, dbFrom + dbSize, kmerThr,
236-
maskMode, maskLowerCase, maskProb, maskNrepeats, targetSearchMode);
237-
235+
if(needKmerIndex || needSequenceLookup){
236+
IndexBuilder::fillDatabase(indexTable, &sequenceLookup,
237+
*subMat, s3, s2, &seq, dbr1, dbFrom, dbFrom + dbSize, kmerThr,
238+
maskMode, maskLowerCase, maskProb, maskNrepeats, targetSearchMode);
239+
}
238240
if (sequenceLookup == NULL) {
239241
Debug(Debug::ERROR) << "Invalid mask mode. No sequence lookup created!\n";
240242
EXIT(EXIT_FAILURE);
241243
}
242244
unsigned int keyOffset = 1000 * s;
243-
if(noKmerIndex == false){
245+
if(needKmerIndex){
244246
indexTable->printStatistics(subMat->num2aa);
245247
// save the entries
246248
Debug(Debug::INFO) << "Write ENTRIES (" << (keyOffset + ENTRIES) << ")\n";
@@ -272,21 +274,21 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB,
272274
writer.writeData(tablesizePtr, 1 * sizeof(size_t), (keyOffset + SEQCOUNT), SPLIT_INDX + s);
273275
writer.alignToPageSize(SPLIT_INDX + s);
274276

275-
Debug(Debug::INFO) << "Write SEQINDEXDATASIZE (" << (keyOffset + SEQINDEXDATASIZE) << ")\n";
276-
int64_t seqindexDataSize = sequenceLookup->getDataSize();
277-
char *seqindexDataSizePtr = (char *) &seqindexDataSize;
278-
writer.writeData(seqindexDataSizePtr, 1 * sizeof(int64_t), (keyOffset + SEQINDEXDATASIZE), SPLIT_INDX + s);
279-
writer.alignToPageSize(SPLIT_INDX + s);
280-
281-
size_t *sequenceOffsets = sequenceLookup->getOffsets();
282-
size_t sequenceCount = sequenceLookup->getSequenceCount();
283-
Debug(Debug::INFO) << "Write SEQINDEXSEQOFFSET (" << (keyOffset + SEQINDEXSEQOFFSET) << ")\n";
284-
writer.writeData((char *) sequenceOffsets, (sequenceCount + 1) * sizeof(size_t), (keyOffset + SEQINDEXSEQOFFSET), SPLIT_INDX + s);
285-
writer.alignToPageSize(SPLIT_INDX + s);
286-
287-
Debug(Debug::INFO) << "Write SEQINDEXDATA (" << (keyOffset + SEQINDEXDATA) << ")\n";
288-
writer.writeData(sequenceLookup->getData(), (sequenceLookup->getDataSize() + 1) * sizeof(char), (keyOffset + SEQINDEXDATA), SPLIT_INDX + s);
289-
writer.alignToPageSize(SPLIT_INDX + s);
277+
if(needSequenceLookup){
278+
Debug(Debug::INFO) << "Write SEQINDEXDATASIZE (" << (keyOffset + SEQINDEXDATASIZE) << ")\n";
279+
int64_t seqindexDataSize = sequenceLookup->getDataSize();
280+
char *seqindexDataSizePtr = (char *) &seqindexDataSize;
281+
writer.writeData(seqindexDataSizePtr, 1 * sizeof(int64_t), (keyOffset + SEQINDEXDATASIZE), SPLIT_INDX + s);
282+
writer.alignToPageSize(SPLIT_INDX + s);
283+
size_t *sequenceOffsets = sequenceLookup->getOffsets();
284+
size_t sequenceCount = sequenceLookup->getSequenceCount();
285+
Debug(Debug::INFO) << "Write SEQINDEXSEQOFFSET (" << (keyOffset + SEQINDEXSEQOFFSET) << ")\n";
286+
writer.writeData((char *) sequenceOffsets, (sequenceCount + 1) * sizeof(size_t), (keyOffset + SEQINDEXSEQOFFSET), SPLIT_INDX + s);
287+
writer.alignToPageSize(SPLIT_INDX + s);
288+
Debug(Debug::INFO) << "Write SEQINDEXDATA (" << (keyOffset + SEQINDEXDATA) << ")\n";
289+
writer.writeData(sequenceLookup->getData(), (sequenceLookup->getDataSize() + 1) * sizeof(char), (keyOffset + SEQINDEXDATA), SPLIT_INDX + s);
290+
writer.alignToPageSize(SPLIT_INDX + s);
291+
}
290292
delete sequenceLookup;
291293
if(indexTable != NULL){
292294
delete indexTable;

0 commit comments

Comments
 (0)