Skip to content

Commit fd1a916

Browse files
committed
Enhance UniprotProxySequenceReader to be able to parse a Uniprot entry when the XML
record is passed as a string. This is useful when parsing a large subset of Uniprot that has been previously downloaded (the Swiss-Prot xml.gz file, for example). Also support extracting Uniprot accessions.
1 parent 6284b34 commit fd1a916

1 file changed

Lines changed: 52 additions & 2 deletions

File tree

biojava3-core/src/main/java/org/biojava3/core/sequence/loader/UniprotProxySequenceReader.java

Lines changed: 52 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,10 +55,8 @@
5555
import org.biojava3.core.sequence.template.SequenceMixin;
5656
import org.biojava3.core.sequence.template.SequenceView;
5757
import org.biojava3.core.util.XMLHelper;
58-
5958
import org.slf4j.Logger;
6059
import org.slf4j.LoggerFactory;
61-
6260
import org.w3c.dom.Document;
6361
import org.w3c.dom.Element;
6462

@@ -98,6 +96,37 @@ public UniprotProxySequenceReader(String accession, CompoundSet<C> compoundSet)
9896
setContents(seq);
9997
}
10098

99+
/**
 * Builds a reader from an already-parsed Uniprot XML document. The document
 * is kept as the backing record of this reader, and the sequence extracted
 * from it becomes the reader's contents.
 *
 * @param document parsed Uniprot XML record to read the sequence from
 * @param compoundSet compound set to associate with this reader
 * @throws Exception propagated from extracting the sequence or from setting
 *         the contents
 */
public UniprotProxySequenceReader(Document document, CompoundSet<C> compoundSet) throws Exception {
    setCompoundSet(compoundSet);
    this.uniprotDoc = document;
    setContents(getSequence(document));
}
112+
/**
113+
* The passed in xml is parsed as a DOM object so we know everything about the protein.
114+
* If an error occurs throw an exception. We could have a bad uniprot id
115+
* @param xml
116+
* @param compoundSet
117+
* @return UniprotProxySequenceReader
118+
* @throws Exception
119+
*/
120+
public static <C extends Compound> UniprotProxySequenceReader<C> parseUniprotXMLString(String xml, CompoundSet<C> compoundSet) {
121+
try {
122+
Document document = XMLHelper.inputStreamToDocument(new ByteArrayInputStream(xml.getBytes()));
123+
return new UniprotProxySequenceReader<C>(document, compoundSet);
124+
} catch (Exception e) {
125+
logger.error("Exception on xml parse of: {}", xml);
126+
}
127+
return null;
128+
}
129+
101130
public void setCompoundSet(CompoundSet<C> compoundSet) {
102131
this.compoundSet = compoundSet;
103132
}
@@ -254,6 +283,27 @@ public AccessionID getAccession() {
254283
return accessionID;
255284
}
256285

286+
/**
287+
* Pull uniprot accessions associated with this sequence
288+
* @return
289+
* @throws Exception
290+
*/
291+
public ArrayList<AccessionID> getAccessions() throws Exception {
292+
ArrayList<AccessionID> accessionList = new ArrayList<AccessionID>();
293+
if (uniprotDoc == null) {
294+
return accessionList;
295+
}
296+
Element uniprotElement = uniprotDoc.getDocumentElement();
297+
Element entryElement = XMLHelper.selectSingleElement(uniprotElement, "entry");
298+
ArrayList<Element> keyWordElementList = XMLHelper.selectElements(entryElement, "accession");
299+
for (Element element : keyWordElementList) {
300+
AccessionID accessionID = new AccessionID(element.getTextContent(), DataSource.UNIPROT);
301+
accessionList.add(accessionID);
302+
}
303+
304+
return accessionList;
305+
}
306+
257307
/**
258308
*
259309
* @param compounds

0 commit comments

Comments
 (0)