Skip to content

Commit 40acb33

Browse files
committed
Fix seg fault when accessing position 1 in reverse strand, make a local gff2db
1 parent 42b2d83 commit 40acb33

File tree

4 files changed

+222
-1
lines changed

4 files changed

+222
-1
lines changed

src/LocalCommandDeclarations.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
//extern int easypredict(int argc, const char **argv, const Command& command);
77
extern int createsetdb(int argc, const char **argv, const Command& command);
8+
extern int gff2db(int argc, const char **argv, const Command& command);
89
extern int aa2foldseek(int argc, const char **argv, const Command& command);
910
extern int clustersearch(int argc, const char **argv, const Command& command);
1011
extern int clusterhits(int argc, const char **argv, const Command& command);

src/spacedust.cpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,15 @@ std::vector<Command> spacedustCommands = {
4141
CITATION_MMSEQS2, {{"fast[a|q]File[.gz|bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &FlatfileAndFolder },
4242
{"setDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
4343
{"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}}},
44-
{"aa2foldseek", aa2foldseek, &localPar.aa2foldseek, COMMAND_MAIN,
44+
{"gff2db", gff2db, &localPar.gff2db, COMMAND_SPECIAL,
45+
"Extract regions from a sequence database based on a GFF3 file",
46+
NULL,
47+
"Ruoshi Zhang <[email protected]>",
48+
"<i:gff3File1> ... <i:gff3FileN> <i:sequenceDB> <o:sequenceDB>",
49+
CITATION_MMSEQS2, {{"gff3File", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfile},
50+
{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb},
51+
{"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb}}},
52+
{"aa2foldseek", aa2foldseek, &localPar.aa2foldseek, COMMAND_MAIN,
4553
"Map a sequence DB to reference foldseek DB",
4654
NULL,
4755
"Ruoshi Zhang <[email protected]> & Milot Mirdita <[email protected]>",

src/workflow/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,6 @@ set(workflow_source_files
33
workflow/clustersearch.cpp
44
workflow/aa2foldseek.cpp
55
workflow/clusterdb.cpp
6+
workflow/gff2db.cpp
67
PARENT_SCOPE
78
)

src/workflow/gff2db.cpp

Lines changed: 211 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,211 @@
1+
#include "DBReader.h"
2+
#include "DBWriter.h"
3+
#include "Debug.h"
4+
#include "Util.h"
5+
#include "MemoryMapped.h"
6+
#include "Orf.h"
7+
#include "Parameters.h"
8+
9+
#ifdef OPENMP
10+
#include <omp.h>
11+
#endif
12+
13+
int gff2db(int argc, const char **argv, const Command &command) {
14+
Parameters &par = Parameters::getInstance();
15+
par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0);
16+
17+
std::string outDb = par.filenames.back();
18+
par.filenames.pop_back();
19+
std::string seqDb = par.filenames.back();
20+
par.filenames.pop_back();
21+
22+
DBReader<unsigned int> reader(seqDb.c_str(), (seqDb + ".index").c_str(), par.threads, DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA | DBReader<unsigned int>::USE_LOOKUP_REV);
23+
reader.open(DBReader<unsigned int>::NOSORT);
24+
DBReader<unsigned int> headerReader((seqDb + "_h").c_str(), (seqDb + "_h.index").c_str(), par.threads, DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA);
25+
headerReader.open(DBReader<unsigned int>::NOSORT);
26+
27+
std::string outDbIndex = outDb + ".index";
28+
DBWriter writer(outDb.c_str(), outDbIndex.c_str(), par.threads, par.compressed, Parameters::DBTYPE_NUCLEOTIDES);
29+
writer.open();
30+
std::string outHdr = outDb + "_h";
31+
std::string outHdrIndex = outDb + "_h.index";
32+
DBWriter headerWriter(outHdr.c_str(), outHdrIndex.c_str(), par.threads, par.compressed, Parameters::DBTYPE_GENERIC_DB);
33+
headerWriter.open();
34+
std::string outLookup = outDb + ".lookup";
35+
std::string outLookupIndex = outDb + ".lookup.index";
36+
DBWriter lookupWriter(outLookup.c_str(), outLookupIndex.c_str(), par.threads, par.compressed, Parameters::DBTYPE_OMIT_FILE);
37+
lookupWriter.open();
38+
39+
FILE *source = FileUtil::openAndDelete((outDb + ".source").c_str(), "w");
40+
for (size_t i = 0; i < par.filenames.size(); ++i) {
41+
if (fprintf(source, "%zu\t%s\n", i, FileUtil::baseName(par.filenames[i]).c_str()) < 0) {
42+
Debug(Debug::ERROR) << "Cannot write to file " << outDb << ".source\n";
43+
return EXIT_FAILURE;
44+
}
45+
}
46+
if (fclose(source) != 0) {
47+
Debug(Debug::ERROR) << "Cannot close file " << outDb << ".source\n";
48+
return EXIT_FAILURE;
49+
}
50+
51+
const std::vector<std::string> features = Util::split(par.gffType, ",");
52+
if (features.empty()) {
53+
Debug(Debug::WARNING) << "No feature types given. All features will be extracted\n";
54+
}
55+
std::vector<size_t> featureCount(features.size(), 0);
56+
57+
if (par.filenames.size() < reader.getSize()) {
58+
Debug(Debug::WARNING) << "Not enough GFF files are provided. Some results might be omitted\n";
59+
}
60+
61+
unsigned int entries_num = 0;
62+
Debug::Progress progress(par.filenames.size());
63+
#pragma omp parallel
64+
{
65+
int thread_idx = 0;
66+
#ifdef OPENMP
67+
thread_idx = omp_get_thread_num();
68+
#endif
69+
70+
char buffer[32768];
71+
const char* fields[255];
72+
73+
std::string revStr;
74+
revStr.reserve(par.maxSeqLen + 1);
75+
76+
std::vector<size_t> localFeatureCount(features.size(), 0);
77+
78+
#pragma omp for schedule(dynamic, 1) nowait
79+
for (size_t i = 0; i < par.filenames.size(); ++i) {
80+
progress.updateProgress();
81+
MemoryMapped file(par.filenames[i], MemoryMapped::WholeFile, MemoryMapped::SequentialScan);
82+
if (!file.isValid()) {
83+
Debug(Debug::ERROR) << "Could not open GFF file " << par.filenames[i] << "\n";
84+
EXIT(EXIT_FAILURE);
85+
}
86+
char *data = (char *) file.getData();
87+
char* end = data + file.mappedSize();
88+
size_t idx = 0;
89+
while (data < end && *data != '\0') {
90+
// line is a comment or empty
91+
if (*data == '#' || *data == '\n') {
92+
data = Util::skipLine(data);
93+
continue;
94+
}
95+
96+
const size_t columns = Util::getFieldsOfLine(data, fields, 255);
97+
data = Util::skipLine(data);
98+
if (columns < 9) {
99+
Debug(Debug::WARNING) << "Not enough columns in GFF file\n";
100+
continue;
101+
}
102+
103+
if (features.empty() == false) {
104+
bool shouldSkip = true;
105+
std::string type(fields[2], fields[3] - fields[2] - 1);
106+
for (size_t i = 0; i < features.size(); ++i) {
107+
if (type.compare(features[i]) == 0) {
108+
localFeatureCount[i]++;
109+
shouldSkip = false;
110+
break;
111+
}
112+
}
113+
if (shouldSkip) {
114+
continue;
115+
}
116+
}
117+
size_t start = Util::fast_atoi<size_t>(fields[3]);
118+
size_t end = Util::fast_atoi<size_t>(fields[4]);
119+
if (start == end) {
120+
Debug(Debug::WARNING) << "Invalid sequence length in line " << idx << "\n";
121+
continue;
122+
}
123+
std::string strand(fields[6], fields[7] - fields[6] - 1);
124+
std::string name(fields[0], fields[1] - fields[0] - 1);
125+
size_t lookupId = reader.getLookupIdByAccession(name);
126+
if (lookupId == SIZE_MAX) {
127+
Debug(Debug::ERROR) << "GFF entry not found in database lookup: " << name << "\n";
128+
EXIT(EXIT_FAILURE);
129+
}
130+
unsigned int lookupKey = reader.getLookupKey(lookupId);
131+
size_t seqId = reader.getId(lookupKey);
132+
if (seqId == UINT_MAX) {
133+
Debug(Debug::ERROR) << "GFF entry not found in sequence database: " << name << "\n";
134+
EXIT(EXIT_FAILURE);
135+
}
136+
137+
unsigned int key = __sync_fetch_and_add(&(entries_num), 1);
138+
size_t bufferLen;
139+
if (strand == "+") {
140+
bufferLen = Orf::writeOrfHeader(buffer, lookupKey, start, end, 0, 0);
141+
} else {
142+
bufferLen = Orf::writeOrfHeader(buffer, lookupKey, end, start, 0, 0);
143+
}
144+
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
145+
146+
char *seq = reader.getData(seqId, thread_idx);
147+
//check the strand instead of end - start
148+
size_t length = end - start + 1;
149+
writer.writeStart(thread_idx);
150+
if (strand == "+") {
151+
size_t len = snprintf(buffer, sizeof(buffer), "%u\t%s_%zu_%zu_%zu\t%zu\n", key, name.c_str(), idx, start, end, i);
152+
lookupWriter.writeData(buffer, len, thread_idx, false, false);
153+
writer.writeAdd(seq + start - 1 , length, thread_idx);
154+
} else {
155+
size_t len = snprintf(buffer, sizeof(buffer), "%u\t%s_%zu_%zu_%zu\t%zu\n", key, name.c_str(), idx, end, start, i);
156+
lookupWriter.writeData(buffer, len, thread_idx, false, false);
157+
// safe accessing in reverse order
158+
for (size_t j = 0; j < length; j++) {
159+
revStr.append(1, Orf::complement(seq[end - 1 - j]));
160+
}
161+
162+
writer.writeAdd(revStr.c_str(), revStr.size(), thread_idx);
163+
revStr.clear();
164+
}
165+
writer.writeAdd("\n", 1, thread_idx);
166+
writer.writeEnd(key, thread_idx);
167+
idx++;
168+
}
169+
file.close();
170+
}
171+
#pragma omp critical
172+
for (size_t i = 0; i < features.size(); ++i) {
173+
featureCount[i] += localFeatureCount[i];
174+
}
175+
}
176+
lookupWriter.close(true);
177+
FileUtil::remove(lookupWriter.getIndexFileName());
178+
headerWriter.close(true);
179+
writer.close(true);
180+
headerReader.close();
181+
reader.close();
182+
if (Debug::debugLevel >= Debug::INFO && features.size() > 0) {
183+
Debug(Debug::INFO) << "Found these feature types and counts:\n";
184+
for (size_t i = 0; i < features.size(); ++i) {
185+
Debug(Debug::INFO) << " - " << features[i] << ": " << featureCount[i] << "\n";
186+
}
187+
} else {
188+
Debug(Debug::INFO) << (entries_num + 1) << " features were extracted\n";
189+
}
190+
191+
if (par.filenames.size() > 1 && par.threads > 1) {
192+
// make identifiers stable
193+
#pragma omp parallel
194+
{
195+
#pragma omp single
196+
{
197+
#pragma omp task
198+
{
199+
DBWriter::createRenumberedDB(outHdr, outHdrIndex, "", "");
200+
}
201+
202+
#pragma omp task
203+
{
204+
DBWriter::createRenumberedDB(outDb, outDbIndex, outDb, outDbIndex);
205+
}
206+
}
207+
}
208+
}
209+
210+
return EXIT_SUCCESS;
211+
}

0 commit comments

Comments
 (0)