From ea552b4d09e157ea067d25fb08cbd80e67ead63c Mon Sep 17 00:00:00 2001
From: Bogdan <to.bogdan@gmail.com>
Date: Wed, 15 Jul 2015 14:52:56 +0200
Subject: [PATCH] URLs for MAXCONTIGIDLEN limit

I've hit a 20-character limit on auto-generated contig IDs in the released prokka 1.11 archive, and started looking for the source of this limit.
I have only managed to find two SeqID length limits: 25 in the Sequin documentation, and 41 in the "annotation pipeline readme"; NCBI's sample GenBank record does not (or no longer?) mention any specific limit on LOCUS name length.

It might be useful to have some limit-related URLs next to the limit definition, so that it is easier to set a new reasonable limit when the default is not suitable for some reason.
---
 bin/prokka | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/bin/prokka b/bin/prokka
index d49a03e..a569dab 100755
--- a/bin/prokka
+++ b/bin/prokka
@@ -51,7 +51,11 @@ my $PROKKA_DOI = '10.1093/bioinformatics/btu153';
 my $DBDIR = "$FindBin::RealBin/../db";
 my $HYPO = 'hypothetical protein';
 my $UNANN = 'unannotated protein';
-my $MAXCONTIGIDLEN = 37;  # Genbank rule
+# http://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html#LocusB : no explicit limits on SeqID length
+# http://www.ncbi.nlm.nih.gov/books/NBK53702/ : recommends keeping sequence IDs shorter than 25 characters
+# http://www.ncbi.nlm.nih.gov/Sequin/sequin.hlp.html#FASTAFormatforNucleotideSequences : "Please limit the SeqID to 25 characters or less."
+# http://www.ncbi.nlm.nih.gov/genomes/static/Annotation_pipeline_README.txt : the entire "gnl|center|<ID1>" string must not be longer than 41 characters
+my $MAXCONTIGIDLEN = 37;
 my @LOG; # buffer up log lines before we have log file ready
 
 # these should accept .faa on STDIN and write report to STDOUT