From ea552b4d09e157ea067d25fb08cbd80e67ead63c Mon Sep 17 00:00:00 2001 From: Bogdan Date: Wed, 15 Jul 2015 14:52:56 +0200 Subject: [PATCH] URLs for MAXCONTIGIDLEN limit I've hit a 20-character limit on auto-generated contig IDs in the released prokka 1.11 archive, and started looking for the source of this limit. I have only managed to find 2 SeqID length limits: 25 in the Sequin documentation, and 41 in the "annotation pipeline readme"; NCBI's sample GenBank record does not (any longer?) mention any specific limits on LOCUS name length. It might be useful to have some limit-related URLs next to the limit definition, so that it is easier to set a new reasonable limit when the default is not suitable for some reason. --- bin/prokka | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bin/prokka b/bin/prokka index d49a03e..a569dab 100755 --- a/bin/prokka +++ b/bin/prokka @@ -51,7 +51,11 @@ my $PROKKA_DOI = '10.1093/bioinformatics/btu153'; my $DBDIR = "$FindBin::RealBin/../db"; my $HYPO = 'hypothetical protein'; my $UNANN = 'unannotated protein'; -my $MAXCONTIGIDLEN = 37; # Genbank rule +# http://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html#LocusB : no explicit limits on SeqID length +# http://www.ncbi.nlm.nih.gov/books/NBK53702/ : recommends to have sequence IDs shorter than 25 characters +# http://www.ncbi.nlm.nih.gov/Sequin/sequin.hlp.html#FASTAFormatforNucleotideSequences : "Please limit the SeqID to 25 characters or less." +# http://www.ncbi.nlm.nih.gov/genomes/static/Annotation_pipeline_README.txt : the entire "gnl|center|" string must not be longer than 41 characters +my $MAXCONTIGIDLEN = 37; my @LOG; # buffer up log lines before we have log file ready # these should accept .faa on STDIN and write report to STDOUT