Skip to content

Commit 6cbbec5

Browse files
authored
Merge branch 'master' into uniprot-sequence-space-fix
2 parents f279dca + 5d79d90 commit 6cbbec5

31 files changed

Lines changed: 3671 additions & 46 deletions

File tree

.gitattributes

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,8 @@
3737
*.sto text
3838
*.tsv text
3939
*.txt text
40-
*.xml text eol=lf #Causing decompression test to fail when line endings in org/biojava/nbio/core/util/build.xml are crlf
40+
# eol=lf : CRLF line endings in org/biojava/nbio/core/util/build.xml cause the decompression test to fail
41+
*.xml text eol=lf
4142
*.xsd text
4243
*.yml text
4344

biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/GenbankSequenceParser.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ public class GenbankSequenceParser<S extends AbstractSequence<C>, C extends Comp
109109
protected static final String START_SEQUENCE_TAG = "ORIGIN";
110110
protected static final String END_SEQUENCE_TAG = "//";
111111
// locus line
112-
protected static final Pattern lp = Pattern.compile("^(\\S+)\\s+\\d+\\s+(bp|aa)\\s{1,4}(([dms]s-)?(\\S+))?\\s+(circular|linear)?\\s*(\\S+)?\\s*(\\S+)?$");
112+
protected static final Pattern lp = Pattern.compile("^(\\S+)\\s+\\d+\\s+(bp|BP|aa|AA)\\s{0,4}(([dmsDMS][sS]-)?(\\S+))?\\s*(circular|CIRCULAR|linear|LINEAR)?\\s*(\\S+)?\\s*(\\S+)?$");
113113
// version line
114114
protected static final Pattern vp = Pattern.compile("^(\\S*?)(\\.(\\d+))?(\\s+GI:(\\S+))?$");
115115
// reference line
@@ -164,9 +164,9 @@ private String parse(BufferedReader bufferedReader) {
164164
String lengthUnits = m.group(2);
165165
String type = m.group(5);
166166

167-
if (lengthUnits.equals("aa")) {
167+
if (lengthUnits.equalsIgnoreCase("aa")) {
168168
compoundType = AminoAcidCompoundSet.getAminoAcidCompoundSet();
169-
} else if (lengthUnits.equals("bp")) {
169+
} else if (lengthUnits.equalsIgnoreCase("bp")) {
170170
if (type != null) {
171171
if (type.contains("RNA")) {
172172
compoundType = RNACompoundSet.getRNACompoundSet();

biojava-core/src/main/java/org/biojava/nbio/core/sequence/loader/UniprotProxySequenceReader.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,13 +142,15 @@ public void setCompoundSet(CompoundSet<C> compoundSet) {
142142

143143
/**
144144
* Once the sequence is retrieved, set the contents and make sure everything is valid.
145+
* Some uniprot records contain white space in the sequence. We must strip it out so setContents doesn't fail.
145146
* @param sequence
146147
* @throws CompoundNotFoundException
147148
*/
148149
@Override
149150
public void setContents(String sequence) throws CompoundNotFoundException {
150151
// Horrendously inefficient - pretty much the way the old BJ did things.
151152
// TODO Should be optimised.
153+
// NOTE This chokes on whitespace in the sequence, so whitespace is stripped
152154
this.sequence = sequence.replaceAll("\\s", "").trim();
153155
this.parsedCompounds.clear();
154156
for (int i = 0; i < this.sequence.length();) {

biojava-core/src/test/java/org/biojava/nbio/core/sequence/io/GenbankReaderTest.java

Lines changed: 106 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,12 @@
2222

2323
import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
2424
import org.biojava.nbio.core.sequence.DNASequence;
25+
import org.biojava.nbio.core.sequence.RNASequence;
2526
import org.biojava.nbio.core.sequence.ProteinSequence;
2627
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
2728
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
2829
import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
30+
import org.biojava.nbio.core.sequence.compound.RNACompoundSet;
2931
import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
3032
import org.biojava.nbio.core.sequence.features.FeatureInterface;
3133
import org.biojava.nbio.core.sequence.features.Qualifier;
@@ -40,8 +42,10 @@
4042
import org.slf4j.LoggerFactory;
4143

4244
import java.io.BufferedInputStream;
45+
import java.io.BufferedReader;
4346
import java.io.IOException;
4447
import java.io.InputStream;
48+
import java.io.InputStreamReader;
4549
import java.util.ArrayList;
4650
import java.util.LinkedHashMap;
4751
import java.util.List;
@@ -230,34 +234,60 @@ public void CDStest() throws Exception {
230234

231235
}
232236

233-
private DNASequence readGenbankResource(final String resource) throws Exception {
234-
DNASequence sequence = null;
235-
InputStream inputStream = null;
236-
try {
237-
inputStream = getClass().getResourceAsStream(resource);
238-
239-
GenbankReader<DNASequence, NucleotideCompound> genbankDNA
240-
= new GenbankReader<>(
241-
inputStream,
242-
new GenericGenbankHeaderParser<>(),
243-
new DNASequenceCreator(DNACompoundSet.getDNACompoundSet())
244-
);
245-
LinkedHashMap<String, DNASequence> dnaSequences = genbankDNA.process();
246-
sequence = dnaSequences.values().iterator().next();
247-
}
248-
finally {
249-
try {
250-
inputStream.close();
251-
}
252-
catch (Exception e) {
253-
// ignore
254-
}
237+
private DNASequence readGenbankResource(final String resource) throws IOException, CompoundNotFoundException {
238+
InputStream inputStream = getClass().getResourceAsStream(resource);
239+
GenbankReader<DNASequence, NucleotideCompound> genbankDNA
240+
= new GenbankReader<>(
241+
inputStream,
242+
new GenericGenbankHeaderParser<>(),
243+
new DNASequenceCreator(DNACompoundSet.getDNACompoundSet())
244+
);
245+
LinkedHashMap<String, DNASequence> dnaSequences = genbankDNA.process();
246+
return dnaSequences.values().iterator().next();
247+
}
248+
249+
private RNASequence readGenbankRNAResource(final String resource) throws IOException, CompoundNotFoundException {
250+
InputStream inputStream = getClass().getResourceAsStream(resource);
251+
GenbankReader<RNASequence, NucleotideCompound> genbankRNA
252+
= new GenbankReader<>(
253+
inputStream,
254+
new GenericGenbankHeaderParser<>(),
255+
new RNASequenceCreator(RNACompoundSet.getRNACompoundSet())
256+
);
257+
LinkedHashMap<String, RNASequence> rnaSequences = genbankRNA.process();
258+
return rnaSequences.values().iterator().next();
259+
}
260+
261+
private ProteinSequence readGenbankProteinResource(final String resource) throws IOException, CompoundNotFoundException {
262+
InputStream inputStream = getClass().getResourceAsStream(resource);
263+
GenbankReader<ProteinSequence, AminoAcidCompound> genbankProtein
264+
= new GenbankReader<>(
265+
inputStream,
266+
new GenericGenbankHeaderParser<>(),
267+
new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet())
268+
);
269+
LinkedHashMap<String, ProteinSequence> proteinSequences = genbankProtein.process();
270+
return proteinSequences.values().iterator().next();
271+
}
272+
273+
private AbstractSequence<?> readUnknownGenbankResource(final String resource) throws IOException, CompoundNotFoundException {
274+
InputStream inputStream = getClass().getResourceAsStream(resource);
275+
GenbankSequenceParser genbankParser = new GenbankSequenceParser();
276+
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream));
277+
String seqString = genbankParser.getSequence(bufferedReader, 0);
278+
String compoundSet = genbankParser.getCompoundType().getClass().getSimpleName();
279+
280+
if (compoundSet.equals("AminoAcidCompoundSet")) {
281+
return readGenbankProteinResource(resource);
282+
} else if (compoundSet.equals("RNACompoundSet")) {
283+
return readGenbankRNAResource(resource);
284+
} else {
285+
return readGenbankResource(resource);
255286
}
256-
return sequence;
257287
}
258288

259289
@Test
260-
public void testNcbiExpandedAccessionFormats() throws Exception {
290+
public void testNcbiExpandedAccessionFormats() throws IOException, CompoundNotFoundException {
261291
DNASequence header0 = readGenbankResource("/empty_header0.gb");
262292
assertEquals("CP032762 5868661 bp DNA circular BCT 15-OCT-2018", header0.getOriginalHeader());
263293

@@ -267,6 +297,58 @@ public void testNcbiExpandedAccessionFormats() throws Exception {
267297
DNASequence header2 = readGenbankResource("/empty_header2.gb");
268298
assertEquals("AZZZAA02123456789 10000000000 bp DNA linear PRI 15-OCT-2018", header2.getOriginalHeader());
269299
}
300+
301+
@Test
302+
public void testLegacyLocusCompatable() throws IOException, CompoundNotFoundException {
303+
304+
// Testing opening a genbank file with uppercase units, strand and topology
305+
AbstractSequence header0 = readUnknownGenbankResource("/org/biojava/nbio/core/sequence/io/uppercase_locus0.gb");
306+
assertEquals("ABC12.3_DE 7071 BP DS-DNA CIRCULAR SYN 22-JUL-1994", header0.getOriginalHeader());
307+
assertEquals("ABC12.3_DE", header0.getAccession().getID());
308+
assertEquals("DNACompoundSet", header0.getCompoundSet().getClass().getSimpleName());
309+
310+
// Testing uppercase SS strand
311+
AbstractSequence header1 = readUnknownGenbankResource("/org/biojava/nbio/core/sequence/io//uppercase_locus1.gb");
312+
assertEquals("ABC12.3_DE 7071 BP SS-DNA CIRCULAR SYN 13-JUL-1994", header1.getOriginalHeader());
313+
assertEquals("ABC12.3_DE", header1.getAccession().getID());
314+
assertEquals("DNACompoundSet", header0.getCompoundSet().getClass().getSimpleName());
315+
316+
// Testing uppercase MS strand
317+
AbstractSequence header2 = readUnknownGenbankResource("/org/biojava/nbio/core/sequence/io//uppercase_locus2.gb");
318+
assertEquals("ABC12.3_DE 7071 BP MS-DNA CIRCULAR SYN 13-JUL-1994", header2.getOriginalHeader());
319+
assertEquals("ABC12.3_DE", header2.getAccession().getID());
320+
assertEquals("DNACompoundSet", header0.getCompoundSet().getClass().getSimpleName());
321+
322+
// Testing uppercase LINEAR topology
323+
AbstractSequence header3 = readUnknownGenbankResource("/org/biojava/nbio/core/sequence/io//uppercase_locus3.gb");
324+
assertEquals("ABC12.3_DE 7071 BP DNA LINEAR SYN 22-JUL-1994", header3.getOriginalHeader());
325+
assertEquals("ABC12.3_DE", header3.getAccession().getID());
326+
assertEquals("DNACompoundSet", header0.getCompoundSet().getClass().getSimpleName());
327+
328+
// Testing uppercase units with no strand or topology
329+
AbstractSequence header4 = readUnknownGenbankResource("/org/biojava/nbio/core/sequence/io//uppercase_locus4.gb");
330+
assertEquals("ABC12.3_DE 7071 BP RNA SYN 13-JUL-1994", header4.getOriginalHeader());
331+
assertEquals("ABC12.3_DE", header4.getAccession().getID());
332+
assertEquals("RNACompoundSet", header4.getCompoundSet().getClass().getSimpleName());
333+
334+
// Testing uppercase units with no strand, topology, division or date
335+
AbstractSequence header5 = readUnknownGenbankResource("/org/biojava/nbio/core/sequence/io//uppercase_locus5.gb");
336+
assertEquals("ABC12.3_DE 7071 BP DNA", header5.getOriginalHeader());
337+
assertEquals("ABC12.3_DE", header5.getAccession().getID());
338+
339+
// Testing uppercase units with no strand, molecule type, topology, division or date
340+
AbstractSequence header6 = readUnknownGenbankResource("/org/biojava/nbio/core/sequence/io//uppercase_locus6.gb");
341+
assertEquals("ABC12.3_DE 7071 BP", header6.getOriginalHeader());
342+
assertEquals("ABC12.3_DE", header6.getAccession().getID());
343+
assertEquals("DNACompoundSet", header0.getCompoundSet().getClass().getSimpleName());
344+
345+
// Testing uppercase protein units
346+
AbstractSequence header7 = readUnknownGenbankResource("/org/biojava/nbio/core/sequence/io//uppercase_locus7.gb");
347+
assertEquals("ABC12.3_DE 7071 AA Protein", header7.getOriginalHeader());
348+
assertEquals("ABC12.3_DE", header7.getAccession().getID());
349+
assertEquals("AminoAcidCompoundSet", header7.getCompoundSet().getClass().getSimpleName());
350+
351+
}
270352

271353
/**
272354
* Helper class to be able to verify the closed state of the input stream.

biojava-core/src/test/java/org/biojava/nbio/core/sequence/loader/GenbankProxySequenceReaderTest.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,10 @@ so it should be done here (manually).
169169
Assert.assertTrue(!codedBy.isEmpty());
170170
logger.info("\t\tcoded_by: {}", codedBy);
171171
}
172+
173+
// GenBank limits requests per second; pause before the next test, otherwise we get HTTP 429 error codes - JD 2018-12-14
174+
// See https://github.com/biojava/biojava/issues/837
175+
Thread.sleep(500);
172176
}
173177

174178
@Test
@@ -207,5 +211,10 @@ public void testProteinSequenceFactoring() throws Exception {
207211
} else {
208212
logger.info("target {} has no CDS", gi);
209213
}
214+
215+
// GenBank limits requests per second; pause before the next test, otherwise we get HTTP 429 error codes - JD 2018-12-14
216+
// See https://github.com/biojava/biojava/issues/837
217+
Thread.sleep(500);
218+
210219
}
211220
}
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
LOCUS ABC12.3_DE 7071 BP DS-DNA CIRCULAR SYN 22-JUL-1994
2+
DEFINITION -
3+
KEYWORDS -
4+
SOURCE -
5+
FEATURES Location/Qualifiers
6+
ORIGIN
7+
//
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
LOCUS ABC12.3_DE 7071 BP SS-DNA CIRCULAR SYN 13-JUL-1994
2+
DEFINITION -
3+
KEYWORDS -
4+
SOURCE -
5+
FEATURES Location/Qualifiers
6+
ORIGIN
7+
//
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
LOCUS ABC12.3_DE 7071 BP MS-DNA CIRCULAR SYN 13-JUL-1994
2+
DEFINITION -
3+
KEYWORDS -
4+
SOURCE -
5+
FEATURES Location/Qualifiers
6+
ORIGIN
7+
//
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
LOCUS ABC12.3_DE 7071 BP DNA LINEAR SYN 22-JUL-1994
2+
DEFINITION -
3+
TITLE -
4+
FEATURES Location/Qualifiers
5+
ORIGIN
6+
//
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
LOCUS ABC12.3_DE 7071 BP RNA SYN 13-JUL-1994
2+
DEFINITION -
3+
TITLE -
4+
FEATURES Location/Qualifiers
5+
ORIGIN
6+
//

0 commit comments

Comments
 (0)