Skip to content

Commit 40ff96b

Browse files
Karl Nicholas
authored and committed
More work on Genbank files, bringing them in line with the interfaces.
Renamed FastaHeaderParserInterface to SequenceHeaderParserInterface. Changed SequenceParserInterface to use BufferedReader instead of DataInput to provide flexibility for GenbankParser. Updated SequenceFileProxyLoader to use BufferedReader.
1 parent 4949980 commit 40ff96b

20 files changed

Lines changed: 424 additions & 204 deletions

biojava3-core/src/main/java/org/biojava3/core/sequence/io/FastaReader.java

Lines changed: 14 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,7 @@
3333
import org.biojava3.core.sequence.ProteinSequence;
3434
import org.biojava3.core.sequence.compound.AminoAcidCompound;
3535
import org.biojava3.core.sequence.compound.AminoAcidCompoundSet;
36-
import org.biojava3.core.sequence.io.template.FastaHeaderParserInterface;
36+
import org.biojava3.core.sequence.io.template.SequenceHeaderParserInterface;
3737
import org.biojava3.core.sequence.io.template.SequenceCreatorInterface;
3838
import org.biojava3.core.sequence.template.Compound;
3939
import org.biojava3.core.sequence.template.Sequence;
@@ -46,7 +46,7 @@
4646
public class FastaReader<S extends Sequence<?>, C extends Compound> {
4747

4848
SequenceCreatorInterface<C> sequenceCreator;
49-
FastaHeaderParserInterface<S,C> headerParser;
49+
SequenceHeaderParserInterface<S,C> headerParser;
5050
BufferedReaderBytesRead br;
5151
InputStreamReader isr;
5252
FileInputStream fi = null;
@@ -63,7 +63,7 @@ public class FastaReader<S extends Sequence<?>, C extends Compound> {
6363
* @param headerParser
6464
* @param sequenceCreator
6565
*/
66-
public FastaReader(InputStream is, FastaHeaderParserInterface<S,C> headerParser,
66+
public FastaReader(InputStream is, SequenceHeaderParserInterface<S,C> headerParser,
6767
SequenceCreatorInterface<C> sequenceCreator) {
6868
this.headerParser = headerParser;
6969
isr = new InputStreamReader(is);
@@ -84,7 +84,7 @@ public FastaReader(InputStream is, FastaHeaderParserInterface<S,C> headerParser,
8484
* @throws SecurityException if a security manager exists and its checkRead
8585
* method denies read access to the file.
8686
*/
87-
public FastaReader(File file, FastaHeaderParserInterface<S,C> headerParser,
87+
public FastaReader(File file, SequenceHeaderParserInterface<S,C> headerParser,
8888
SequenceCreatorInterface<C> sequenceCreator) throws FileNotFoundException {
8989
this.headerParser = headerParser;
9090
fi = new FileInputStream(file);
@@ -216,7 +216,16 @@ public static void main(String[] args) {
216216
System.out.println(proteinSequences);
217217

218218
File file = new File(inputFile);
219-
FastaReader<ProteinSequence,AminoAcidCompound> fastaProxyReader = new FastaReader<ProteinSequence,AminoAcidCompound>(file, new GenericFastaHeaderParser<ProteinSequence,AminoAcidCompound>(), new FileProxyProteinSequenceCreator(file, AminoAcidCompoundSet.getAminoAcidCompoundSet()));
219+
FastaReader<ProteinSequence,AminoAcidCompound> fastaProxyReader =
220+
new FastaReader<ProteinSequence,AminoAcidCompound>(
221+
file,
222+
new GenericFastaHeaderParser<ProteinSequence,AminoAcidCompound>(),
223+
new FileProxyProteinSequenceCreator(
224+
file,
225+
AminoAcidCompoundSet.getAminoAcidCompoundSet(),
226+
new FastaSequenceParser()
227+
)
228+
);
220229
LinkedHashMap<String,ProteinSequence> proteinProxySequences = fastaProxyReader.process();
221230

222231
for(String key : proteinProxySequences.keySet()){

biojava3-core/src/main/java/org/biojava3/core/sequence/io/FastaReaderHelper.java

Lines changed: 11 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@
3232
import org.biojava3.core.sequence.compound.AminoAcidCompoundSet;
3333
import org.biojava3.core.sequence.compound.DNACompoundSet;
3434
import org.biojava3.core.sequence.compound.NucleotideCompound;
35+
import org.biojava3.core.sequence.template.AbstractSequence;
3536

3637
/**
3738
*
@@ -53,7 +54,16 @@ public static LinkedHashMap<String, DNASequence> readFastaDNASequence(File file,
5354
return readFastaDNASequence(file);
5455
}
5556

56-
FastaReader<DNASequence, NucleotideCompound> fastaProxyReader = new FastaReader<DNASequence, NucleotideCompound>(file, new GenericFastaHeaderParser<DNASequence, NucleotideCompound>(), new FileProxyDNASequenceCreator(file, DNACompoundSet.getDNACompoundSet()));
57+
FastaReader<DNASequence, NucleotideCompound> fastaProxyReader =
58+
new FastaReader<DNASequence, NucleotideCompound>(
59+
file,
60+
new GenericFastaHeaderParser<DNASequence, NucleotideCompound>(),
61+
new FileProxyDNASequenceCreator(
62+
file,
63+
DNACompoundSet.getDNACompoundSet(),
64+
new FastaSequenceParser()
65+
)
66+
);
5767
return fastaProxyReader.process();
5868

5969
}

biojava3-core/src/main/java/org/biojava3/core/sequence/io/FastaSequenceParser.java

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,8 @@
2121
*/
2222
package org.biojava3.core.sequence.io;
2323

24-
import java.io.DataInput;
24+
import java.io.BufferedReader;
25+
2526
import org.biojava3.core.sequence.io.template.SequenceParserInterface;
2627

2728
/**
@@ -30,7 +31,7 @@
3031
*/
3132
public class FastaSequenceParser implements SequenceParserInterface {
3233

33-
public String getSequence(DataInput dataInput, int sequenceLength) throws Exception {
34+
public String getSequence(BufferedReader bufferedReader, int sequenceLength) throws Exception {
3435
StringBuilder sb;
3536
if (sequenceLength != -1) {
3637
sb = new StringBuilder(sequenceLength);
@@ -39,7 +40,7 @@ public String getSequence(DataInput dataInput, int sequenceLength) throws Except
3940
}
4041
boolean keepGoing = true;
4142
while (keepGoing) {
42-
String line = dataInput.readLine();
43+
String line = bufferedReader.readLine();
4344
if (line == null || line.startsWith(">")) {
4445
break;
4546
}

biojava3-core/src/main/java/org/biojava3/core/sequence/io/FileProxyDNASequenceCreator.java

Lines changed: 13 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@
2727
import org.biojava3.core.sequence.DNASequence;
2828
import org.biojava3.core.sequence.compound.NucleotideCompound;
2929
import org.biojava3.core.sequence.io.template.SequenceCreatorInterface;
30+
import org.biojava3.core.sequence.io.template.SequenceParserInterface;
3031
import org.biojava3.core.sequence.loader.SequenceFileProxyLoader;
3132
import org.biojava3.core.sequence.template.AbstractSequence;
3233
import org.biojava3.core.sequence.template.CompoundSet;
@@ -48,17 +49,20 @@ public class FileProxyDNASequenceCreator implements
4849
SequenceCreatorInterface<NucleotideCompound> {
4950

5051
CompoundSet<NucleotideCompound> compoundSet = null;
51-
File fastaFile = null;
52+
File file = null;
53+
SequenceParserInterface sequenceParser;
5254

5355
/**
5456
* Need File so that we can store full path name in SequenceFileProxyLoader for Random File access as a quick read
5557
* @param file
5658
* @param compoundSet
5759
*/
58-
public FileProxyDNASequenceCreator(File fastaFile,
59-
CompoundSet<NucleotideCompound> compoundSet) {
60+
public FileProxyDNASequenceCreator(File file,
61+
CompoundSet<NucleotideCompound> compoundSet,
62+
SequenceParserInterface sequenceParser) {
6063
this.compoundSet = compoundSet;
61-
this.fastaFile = fastaFile;
64+
this.file = file;
65+
this.sequenceParser = sequenceParser;
6266
}
6367

6468
/**
@@ -69,10 +73,12 @@ public FileProxyDNASequenceCreator(File fastaFile,
6973
* @return
7074
*/
7175

72-
public AbstractSequence<NucleotideCompound> getSequence(String sequence,
73-
long index) {
76+
public AbstractSequence<NucleotideCompound> getSequence(String sequence, long index ) {
7477
SequenceFileProxyLoader<NucleotideCompound> sequenceFileProxyLoader = new SequenceFileProxyLoader<NucleotideCompound>(
75-
fastaFile, new FastaSequenceParser(), index, sequence.length(),
78+
file,
79+
sequenceParser,
80+
index,
81+
sequence.length(),
7682
compoundSet);
7783
return new DNASequence(sequenceFileProxyLoader, compoundSet);
7884
}

biojava3-core/src/main/java/org/biojava3/core/sequence/io/FileProxyProteinSequenceCreator.java

Lines changed: 18 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@
2727
import org.biojava3.core.sequence.ProteinSequence;
2828
import org.biojava3.core.sequence.compound.AminoAcidCompound;
2929
import org.biojava3.core.sequence.io.template.SequenceCreatorInterface;
30+
import org.biojava3.core.sequence.io.template.SequenceParserInterface;
3031
import org.biojava3.core.sequence.loader.SequenceFileProxyLoader;
3132
import org.biojava3.core.sequence.template.AbstractSequence;
3233
import org.biojava3.core.sequence.template.CompoundSet;
@@ -44,21 +45,21 @@
4445
*
4546
* @author Scooter Willis &lt;willishf at gmail dot com&gt;
4647
*/
47-
public class FileProxyProteinSequenceCreator implements
48-
SequenceCreatorInterface<AminoAcidCompound> {
49-
50-
CompoundSet<AminoAcidCompound> compoundSet = null;
51-
File fastaFile = null;
48+
public class FileProxyProteinSequenceCreator implements SequenceCreatorInterface<AminoAcidCompound> {
5249

50+
CompoundSet<AminoAcidCompound> compoundSet;
51+
File file;
52+
SequenceParserInterface sequenceParser;
53+
5354
/**
5455
* Need File so that we can store full path name in SequenceFileProxyLoader for Random File access as a quick read
5556
* @param file
5657
* @param compoundSet
5758
*/
58-
public FileProxyProteinSequenceCreator(File fastaFile,
59-
CompoundSet<AminoAcidCompound> compoundSet) {
59+
public FileProxyProteinSequenceCreator(File file, CompoundSet<AminoAcidCompound> compoundSet, SequenceParserInterface sequenceParser ) {
6060
this.compoundSet = compoundSet;
61-
this.fastaFile = fastaFile;
61+
this.file = file;
62+
this.sequenceParser = sequenceParser;
6263
}
6364

6465
/**
@@ -69,11 +70,15 @@ public FileProxyProteinSequenceCreator(File fastaFile,
6970
* @return
7071
*/
7172

72-
public AbstractSequence<AminoAcidCompound> getSequence(String sequence,
73-
long index) {
74-
SequenceFileProxyLoader<AminoAcidCompound> sequenceFileProxyLoader = new SequenceFileProxyLoader<AminoAcidCompound>(
75-
fastaFile, new FastaSequenceParser(), index, sequence.length(),
76-
compoundSet);
73+
public AbstractSequence<AminoAcidCompound> getSequence(String sequence, long index) {
74+
SequenceFileProxyLoader<AminoAcidCompound> sequenceFileProxyLoader =
75+
new SequenceFileProxyLoader<AminoAcidCompound>(
76+
file,
77+
sequenceParser,
78+
index,
79+
sequence.length(),
80+
compoundSet
81+
);
7782
return new ProteinSequence(sequenceFileProxyLoader, compoundSet);
7883
}
7984

biojava3-core/src/main/java/org/biojava3/core/sequence/io/GenbankReader.java

Lines changed: 34 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,10 @@
1010
*
1111
* Copyright for this code is held jointly by the individual
1212
* authors. These should be listed in @author doc comments.
13-
*
13+
*
14+
* @author Scooter Willis &lt;willishf at gmail dot com&gt;
15+
* @author Karl Nicholas <github:karlnicholas>
16+
*
1417
* For more information on the BioJava project and its aims,
1518
* or to join the biojava-l mailing list, visit the home page
1619
* at:
@@ -21,11 +24,13 @@
2124
*/
2225
package org.biojava3.core.sequence.io;
2326

27+
import java.io.BufferedReader;
2428
import java.io.File;
2529
import java.io.FileInputStream;
2630
import java.io.FileNotFoundException;
2731
import java.io.IOException;
2832
import java.io.InputStream;
33+
import java.io.InputStreamReader;
2934
import java.util.HashMap;
3035
import java.util.LinkedHashMap;
3136

@@ -35,26 +40,21 @@
3540
import org.biojava3.core.sequence.compound.AminoAcidCompoundSet;
3641
import org.biojava3.core.sequence.compound.DNACompoundSet;
3742
import org.biojava3.core.sequence.compound.NucleotideCompound;
38-
import org.biojava3.core.sequence.io.template.GenbankHeaderParserInterface;
3943
import org.biojava3.core.sequence.io.template.SequenceCreatorInterface;
44+
import org.biojava3.core.sequence.io.template.SequenceHeaderParserInterface;
45+
import org.biojava3.core.sequence.template.AbstractSequence;
4046
import org.biojava3.core.sequence.template.Compound;
41-
import org.biojava3.core.sequence.template.Sequence;
4247

4348
/**
4449
* Use GenbankReaderHelper as an example of how to use this class where GenbankReaderHelper should be the
4550
* primary class used to read Genbank files
46-
* -- copied from original FastReader by Scooter Willis ;lt;willishf at gmail dot com&gt;
47-
* @author Karl Nicholas
48-
51+
*
4952
*/
50-
public class GenbankReader<S extends Sequence<?>, C extends Compound> {
53+
public class GenbankReader<S extends AbstractSequence<C>, C extends Compound> {
5154

52-
SequenceCreatorInterface<C> sequenceCreator;
53-
GenbankHeaderParserInterface<S,C> headerParser;
54-
FileInputStream fi = null;
55-
long fileIndex = 0;
56-
long sequenceIndex = 0;
57-
GenbankParser<S,C> genbankParser;
55+
private SequenceCreatorInterface<C> sequenceCreator;
56+
private GenbankSequenceParser<S,C> genbankParser;
57+
private InputStream inputStream;
5858

5959
/**
6060
* If you are going to use FileProxyProteinSequenceCreator then do not use this constructor because we need details about
@@ -64,11 +64,10 @@ public class GenbankReader<S extends Sequence<?>, C extends Compound> {
6464
* @param headerParser
6565
* @param sequenceCreator
6666
*/
67-
public GenbankReader(InputStream is, GenbankHeaderParserInterface<S,C> headerParser,
68-
SequenceCreatorInterface<C> sequenceCreator) {
69-
this.headerParser = headerParser;
67+
public GenbankReader(InputStream is, SequenceHeaderParserInterface<S,C> headerParser, SequenceCreatorInterface<C> sequenceCreator) {
7068
this.sequenceCreator = sequenceCreator;
71-
genbankParser = new GenbankParser<S,C>(is, headerParser);
69+
this.inputStream = is;
70+
genbankParser = new GenbankSequenceParser<S,C>();
7271
}
7372

7473
/**
@@ -84,12 +83,15 @@ public GenbankReader(InputStream is, GenbankHeaderParserInterface<S,C> headerPar
8483
* @throws SecurityException if a security manager exists and its checkRead
8584
* method denies read access to the file.
8685
*/
87-
public GenbankReader(File file, GenbankHeaderParserInterface<S,C> headerParser,
88-
SequenceCreatorInterface<C> sequenceCreator) throws FileNotFoundException {
89-
this.headerParser = headerParser;
90-
fi = new FileInputStream(file);
86+
public GenbankReader(
87+
File file,
88+
SequenceHeaderParserInterface<S,C> headerParser,
89+
SequenceCreatorInterface<C> sequenceCreator
90+
) throws FileNotFoundException {
91+
92+
inputStream = new FileInputStream(file);
9193
this.sequenceCreator = sequenceCreator;
92-
genbankParser = new GenbankParser<S,C>(fi, headerParser);
94+
genbankParser = new GenbankSequenceParser<S,C>();
9395
}
9496

9597
/**
@@ -101,11 +103,10 @@ public GenbankReader(File file, GenbankHeaderParserInterface<S,C> headerParser,
101103
* @see #process(int)
102104
* @return {@link HashMap} containing all the parsed Genbank records
103105
* present, starting current fileIndex onwards.
104-
* @throws IOException if an error occurs reading the input file
106+
* @throws Exception
105107
*/
106-
public LinkedHashMap<String,S> process() throws IOException {
108+
public LinkedHashMap<String,S> process() throws Exception {
107109
LinkedHashMap<String,S> sequences = process(-1);
108-
close();
109110
return sequences;
110111
}
111112

@@ -127,22 +128,20 @@ public LinkedHashMap<String,S> process() throws IOException {
127128
* @param max maximum number of records to return, <code>-1</code> for infinity.
128129
* @return {@link HashMap} containing maximum <code>max</code> parsed Genbank records
129130
* present, starting current fileIndex onwards.
130-
* @throws IOException if an error occurs reading the input file
131+
* @throws Exception
131132
*/
132-
public LinkedHashMap<String,S> process(int max) throws IOException {
133+
public LinkedHashMap<String,S> process(int max) throws Exception {
133134
LinkedHashMap<String,S> sequences = new LinkedHashMap<String,S>();
134-
genbankParser.parse();
135-
S sequence = genbankParser.getSequence(sequenceCreator);
135+
@SuppressWarnings("unchecked")
136+
S sequence = (S) sequenceCreator.getSequence(genbankParser.getSequence(new BufferedReader(new InputStreamReader(inputStream)), 0), 0);
137+
genbankParser.getSequenceHeaderParser().parseHeader(genbankParser.getHeader(), sequence);
136138
sequences.put(sequence.getAccession().getID(), sequence);
139+
close();
137140
return sequences;
138-
139141
}
140142

141143
public void close() throws IOException {
142-
//If stream was created from File object then we need to close it
143-
if (fi != null) {
144-
fi.close();
145-
}
144+
inputStream.close();
146145
}
147146

148147
public static void main(String[] args) throws Exception {
@@ -151,7 +150,6 @@ public static void main(String[] args) throws Exception {
151150

152151
GenbankReader<ProteinSequence, AminoAcidCompound> proteinReader = new GenbankReader<ProteinSequence, AminoAcidCompound>(is, new GenericGenbankHeaderParser<ProteinSequence,AminoAcidCompound>(), new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));
153152
LinkedHashMap<String,ProteinSequence> proteinSequences = proteinReader.process();
154-
is.close();
155153
System.out.println(proteinSequences);
156154

157155
String inputFile = "src/test/resources/NM_000266.gb";
@@ -160,7 +158,7 @@ public static void main(String[] args) throws Exception {
160158
LinkedHashMap<String,DNASequence> dnaSequences = dnaReader.process();
161159
is.close();
162160
System.out.println(dnaSequences);
163-
164161
}
162+
165163
}
166164

0 commit comments

Comments
 (0)