Skip to content

Commit a0b161f

Browse files
committed
Merge branch 'master' into minor
Manually merges pull request biojava#291
2 parents 46ea99d + fe890f2 commit a0b161f

File tree

9 files changed

+206872
-39
lines changed

9 files changed

+206872
-39
lines changed
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
/*
2+
* BioJava development code
3+
*
4+
* This code may be freely distributed and modified under the
5+
* terms of the GNU Lesser General Public Licence. This should
6+
* be distributed with the code. If you do not have a copy,
7+
* see:
8+
*
9+
* http://www.gnu.org/copyleft/lesser.html
10+
*
11+
* Copyright for this code is held jointly by the individual
12+
* authors. These should be listed in @author doc comments.
13+
*
14+
* For more information on the BioJava project and its aims,
15+
* or to join the biojava-l mailing list, visit the home page
16+
* at:
17+
*
18+
* http://www.biojava.org/
19+
*
20+
*/
21+
package org.biojava.nbio.core.exceptions;
22+
23+
/**
 * Holder for message strings shared between the code that throws a
 * {@code ParserException} and the code that inspects its message.
 * <p>
 * This class only carries constants; it is final and has a private
 * constructor so it can neither be instantiated nor subclassed.
 */
public final class Messages {

    /** Message used to signal that the parser ran into the end of the input. */
    public static final String ENDOFFILE = "end of file";

    /** Message used when a parsed section has no key. */
    public static final String SECTIONKEYNULL = "section key is null";

    private Messages() {
        // constants holder, not meant to be instantiated
    }
}

biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/GenbankReader.java

Lines changed: 28 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ public LinkedHashMap<String,S> process() throws IOException, CompoundNotFoundExc
115115
/**
116116
* This method tries to parse maximum <code>max</code> records from
117117
* the open File or InputStream, and leaves the underlying resource open.<br>
118+
*
118119
* Subsequent calls to the same method continue parsing the rest of the file.<br>
119120
* This is particularly useful when dealing with very big data files,
120121
* (e.g. NCBI nr database), which can't fit into memory and will take long
@@ -136,25 +137,35 @@ public LinkedHashMap<String,S> process() throws IOException, CompoundNotFoundExc
136137
public LinkedHashMap<String,S> process(int max) throws IOException, CompoundNotFoundException {
137138
LinkedHashMap<String,S> sequences = new LinkedHashMap<String,S>();
138139
@SuppressWarnings("unchecked")
139-
S sequence = (S) sequenceCreator.getSequence(genbankParser.getSequence(new BufferedReader(new InputStreamReader(inputStream)), 0), 0);
140-
genbankParser.getSequenceHeaderParser().parseHeader(genbankParser.getHeader(), sequence);
140+
int i=0;
141+
BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
142+
while(true) {
143+
if(max>0 && i>=max) break;
144+
i++;
145+
String seqString = genbankParser.getSequence(br, 0);
146+
//reached end of file?
147+
if(seqString==null) break;
148+
S sequence = (S) sequenceCreator.getSequence(seqString, 0);
149+
genbankParser.getSequenceHeaderParser().parseHeader(genbankParser.getHeader(), sequence);
150+
151+
// add features to new sequence
152+
for (String k: genbankParser.getFeatures().keySet()){
153+
for (AbstractFeature f: genbankParser.getFeatures(k)){
154+
//f.getLocations().setSequence(sequence); // can't set proper sequence source to features. It is actually needed? Don't think so...
155+
sequence.addFeature(f);
156+
}
157+
}
141158

142-
// add features to new sequence
143-
for (String k: genbankParser.getFeatures().keySet()){
144-
for (AbstractFeature f: genbankParser.getFeatures(k)){
145-
//f.getLocations().setSequence(sequence); // can't set proper sequence source to features. It is actually needed? Don't think so...
146-
sequence.addFeature(f);
147-
}
159+
// add taxonomy ID to new sequence
160+
ArrayList<DBReferenceInfo> dbQualifier = genbankParser.getDatabaseReferences().get("db_xref");
161+
if (dbQualifier != null){
162+
DBReferenceInfo q = dbQualifier.get(0);
163+
sequence.setTaxonomy(new TaxonomyID(q.getDatabase()+":"+q.getId(), DataSource.GENBANK));
164+
}
165+
166+
sequences.put(sequence.getAccession().getID(), sequence);
148167
}
149-
150-
// add taxonomy ID to new sequence
151-
ArrayList<DBReferenceInfo> dbQualifier = genbankParser.getDatabaseReferences().get("db_xref");
152-
if (dbQualifier != null){
153-
DBReferenceInfo q = dbQualifier.get(0);
154-
sequence.setTaxonomy(new TaxonomyID(q.getDatabase()+":"+q.getId(), DataSource.GENBANK));
155-
}
156-
157-
sequences.put(sequence.getAccession().getID(), sequence);
168+
br.close();
158169
close();
159170
return sequences;
160171
}

biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/GenbankSequenceParser.java

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
*/
3232
package org.biojava.nbio.core.sequence.io;
3333

34+
import org.biojava.nbio.core.exceptions.Messages;
3435
import org.biojava.nbio.core.exceptions.ParserException;
3536
import org.biojava.nbio.core.sequence.DataSource;
3637
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
@@ -127,6 +128,9 @@ public class GenbankSequenceParser<S extends AbstractSequence<C>, C extends Comp
127128

128129
protected static final Pattern readableFiles = Pattern.compile(".*(g[bp]k*$|\\u002eg[bp].*)");
129130
protected static final Pattern headerLine = Pattern.compile("^LOCUS.*");
131+
private static final String DBSOURCE = "DBSOURCE";
132+
private static final String PRIMARY = "PRIMARY";
133+
private static final String DBLINK = "DBLINK";
130134

131135
// private NCBITaxon tax = null;
132136

@@ -140,7 +144,12 @@ private String parse(BufferedReader bufferedReader) {
140144
section = this.readSection(bufferedReader);
141145
sectionKey = ((String[]) section.get(0))[0];
142146
if (sectionKey == null) {
143-
throw new ParserException("Section key was null");
147+
//if we reach the end of the file, section contains empty strings
148+
if(section.get(0)[1]==null || section.get(0)[1]=="" ||
149+
section.get(0)[1].length()==0) {
150+
throw new ParserException(Messages.ENDOFFILE);
151+
}
152+
throw new ParserException(Messages.SECTIONKEYNULL);
144153
}
145154
// process section-by-section
146155
if (sectionKey.equals(LOCUS_TAG)) {
@@ -279,6 +288,16 @@ private String parse(BufferedReader bufferedReader) {
279288
seq.append(((String[]) section.get(i))[1]);
280289
}
281290
seqData = seq.toString().replaceAll("\\s+", "").replaceAll("[\\.|~]", "-").toUpperCase();
291+
} else if(sectionKey.equals(DBSOURCE)) {
292+
//TODO
293+
} else if(sectionKey.equals(PRIMARY)) {
294+
//TODO
295+
} else if(sectionKey.equals(DBLINK)) {
296+
//TODO
297+
} else {
298+
if(!sectionKey.equals(END_SEQUENCE_TAG)) {
299+
log.info("found unknown section key: "+sectionKey);
300+
}
282301
}
283302
} while (!sectionKey.equals(END_SEQUENCE_TAG));
284303
return seqData;
@@ -295,6 +314,7 @@ private String parse(BufferedReader bufferedReader) {
295314
private List<String[]> readSection(BufferedReader bufferedReader) {
296315
List<String[]> section = new ArrayList<String[]>();
297316
String line = "";
317+
298318
String currKey = null;
299319
StringBuffer currVal = new StringBuffer();
300320
boolean done = false;
@@ -364,8 +384,12 @@ public String getSequence(BufferedReader bufferedReader, int sequenceLength) thr
364384
featureCollection = new HashMap<String, ArrayList<AbstractFeature>>();
365385
mapDB = new LinkedHashMap<String, ArrayList<DBReferenceInfo>>();
366386
headerParser = new GenericGenbankHeaderParser<S, C>();
367-
368-
parse(bufferedReader);
387+
try {
388+
parse(bufferedReader);
389+
} catch (ParserException e) {
390+
if(e.getMessage().equalsIgnoreCase(Messages.ENDOFFILE)) return null;
391+
else throw new ParserException(e.getMessage());
392+
}
369393

370394
return seqData;
371395
}

biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/GenericInsdcHeaderFormat.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -281,7 +281,7 @@ private String _insdc_location_string_ignoring_strand_and_subfeatures(
281281
}
282282
} else {
283283
//Typical case, e.g. 12..15 gets mapped to 11:15
284-
return ref + _insdc_feature_position_string(sequenceLocation.getStart(), 1) + ".." + _insdc_feature_position_string(sequenceLocation.getEnd());
284+
return ref + _insdc_feature_position_string(sequenceLocation.getStart(), 0) + ".." + _insdc_feature_position_string(sequenceLocation.getEnd());
285285
}
286286
}
287287
private String _insdc_feature_position_string(Point location) {

0 commit comments

Comments
 (0)