Skip to content

Commit 269188f

Browse files
committed
Now the mmcif parser can read a bare-bones mmcif file.
1 parent b7aa1ba commit 269188f

5 files changed

Lines changed: 148 additions & 99 deletions

File tree

biojava-structure/src/main/java/org/biojava/nbio/structure/io/FileConvert.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
package org.biojava.nbio.structure.io;
2323

2424
import org.biojava.nbio.structure.*;
25+
import org.biojava.nbio.structure.io.mmcif.SimpleMMcifParser;
2526
import org.biojava.nbio.structure.io.mmcif.model.AtomSite;
2627
import org.biojava.nbio.core.util.XMLWriter;
2728

@@ -688,7 +689,7 @@ public String toMMCIF() {
688689

689690
StringBuilder str = new StringBuilder();
690691

691-
str.append(MMCIFFileTools.MMCIF_TOP_HEADER+"BioJava_mmCIF_file"+newline);
692+
str.append(SimpleMMcifParser.MMCIF_TOP_HEADER+"BioJava_mmCIF_file"+newline);
692693

693694
str.append(getAtomSiteHeader());
694695

biojava-structure/src/main/java/org/biojava/nbio/structure/io/MMCIFFileTools.java

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,6 @@ public class MMCIFFileTools {
4141
*/
4242
public static final String MMCIF_DEFAULT_VALUE = ".";
4343

44-
/**
45-
* The header appearing at the beginning of a mmCIF file. A "block code" can be added to it of no more than 32 chars.
46-
*/
47-
public static final String MMCIF_TOP_HEADER = "data_";
4844

4945
/**
5046
* Produces a mmCIF loop header string for the given categoryName and className.

biojava-structure/src/main/java/org/biojava/nbio/structure/io/mmcif/SimpleMMcifConsumer.java

Lines changed: 102 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -615,86 +615,9 @@ public void documentEnd() {
615615
seqResChains.add(seqres);
616616
logger.debug(" seqres: " + asym.getId() + " " + seqres + "<") ;
617617

618+
// adding the compounds (entities)
619+
addCompounds(asym);
618620

619-
int eId = 0;
620-
try {
621-
eId = Integer.parseInt(asym.getEntity_id());
622-
} catch (NumberFormatException e) {
623-
logger.warn("Could not parse mol_id from string {}. Will use 0 for creating Compound",asym.getEntity_id());
624-
}
625-
Entity e = getEntity(eId);
626-
627-
for (EntitySrcGen esg : entitySrcGens) {
628-
629-
if (! esg.getEntity_id().equals(asym.getEntity_id()))
630-
continue;
631-
632-
// found the matching EntitySrcGen
633-
// get the corresponding Entity
634-
Compound c = structure.getCompoundById(eId);
635-
if ( c == null){
636-
if (e!=null && e.getType().equals("polymer")) {
637-
c = createNewCompoundFromESG(esg, eId);
638-
c.setMolName(e.getPdbx_description());
639-
structure.addCompound(c);
640-
logger.debug("Adding Compound with entity id {} from _entity_src_syn, with name: {}",eId,c.getMolName());
641-
}
642-
}
643-
644-
}
645-
646-
for (EntitySrcNat esn : entitySrcNats) {
647-
if (! esn.getEntity_id().equals(asym.getEntity_id()))
648-
continue;
649-
650-
// found the matching EntitySrcGen
651-
// get the corresponding Entity
652-
Compound c = structure.getCompoundById(eId);
653-
if ( c == null){
654-
if (e!=null && e.getType().equals("polymer")) {
655-
c = createNewCompoundFromESN(esn, eId);
656-
c.setMolName(e.getPdbx_description());
657-
structure.addCompound(c);
658-
logger.debug("Adding Compound with entity id {} from _entity_src_syn, with name: {}",eId,c.getMolName());
659-
}
660-
}
661-
662-
}
663-
664-
for (EntitySrcSyn ess : entitySrcSyns) {
665-
if (! ess.getEntity_id().equals(asym.getEntity_id()))
666-
continue;
667-
668-
// found the matching EntitySrcGen
669-
// get the corresponding Entity
670-
Compound c = structure.getCompoundById(eId);
671-
if ( c == null){
672-
if (e!=null && e.getType().equals("polymer")) {
673-
c = createNewCompoundFromESS(ess, eId);
674-
c.setMolName(e.getPdbx_description());
675-
structure.addCompound(c);
676-
logger.debug("Adding Compound with entity id {} from _entity_src_syn, with name: {}",eId,c.getMolName());
677-
}
678-
}
679-
}
680-
681-
// for some mmCIF files like 1yrm all 3 of _entity_src_gen, _entity_src_nat and _pdbx_entity_src_syn are missing
682-
// we need to fill the Compounds in some other way:
683-
684-
Compound c = structure.getCompoundById(eId);
685-
686-
if (c==null) {
687-
c = new Compound();
688-
c.setMolId(eId);
689-
690-
// we only add the compound if a polymeric one (to match what the PDB parser does)
691-
if (e!=null && e.getType().equals("polymer")) {
692-
c.setMolName(e.getPdbx_description());
693-
structure.addCompound(c);
694-
logger.debug("Adding Compound with entity id {} from _entity, with name: {}",eId, c.getMolName());
695-
}
696-
}
697-
698621
}
699622

700623
if ( params.isAlignSeqRes() ){
@@ -712,9 +635,21 @@ public void documentEnd() {
712635
// fix the chain IDS in the current model:
713636

714637
Set<String> asymIds = asymStrandId.keySet();
638+
639+
if (asymIds.isEmpty()) {
640+
logger.warn("No asym ids mapping found in file (categories pdbx_poly_seq_scheme/pdbx_non_poly_seq_scheme). Will create fake asym ids");
641+
642+
if (structure.nrModels()==0) {
643+
logger.error("We should have some models at this point, something is wrong! We'll have an empty structure");
644+
} else {
645+
for (Chain chain : structure.getModel(0)) {
646+
asymStrandId.put(chain.getChainID(),chain.getChainID());
647+
}
648+
}
649+
}
715650

716651
for (int i =0; i< structure.nrModels() ; i++){
717-
List<Chain>model = structure.getModel(i);
652+
List<Chain> model = structure.getModel(i);
718653

719654
List<Chain> pdbChains = new ArrayList<Chain>();
720655

@@ -761,6 +696,12 @@ public void documentEnd() {
761696
while (it.hasNext()) {
762697
Chain chain = it.next();
763698
String entityId = asymId2entityId.get(chain.getInternalChainID());
699+
if (entityId==null) {
700+
// this can happen for instance if the cif file didn't have _struct_asym category at all
701+
// and thus we have no asymId2entityId mapping at all
702+
logger.warn("No entity id could be found for chain {}", chain.getInternalChainID());
703+
continue;
704+
}
764705
int eId = Integer.parseInt(entityId);
765706
// We didn't add above compounds for nonpolymeric entities, thus here if a chain is nonpolymeric
766707
// its compound won't be found. In biojava Structure data model a nonpolymeric chain does not really
@@ -985,6 +926,87 @@ private int getInternalNr(Group atomG) {
985926
return new Long(he.getId()).intValue();
986927
}
987928
}
929+
930+
private void addCompounds(StructAsym asym) {
931+
int eId = 0;
932+
try {
933+
eId = Integer.parseInt(asym.getEntity_id());
934+
} catch (NumberFormatException e) {
935+
logger.warn("Could not parse mol_id from string {}. Will use 0 for creating Compound",asym.getEntity_id());
936+
}
937+
Entity e = getEntity(eId);
938+
939+
for (EntitySrcGen esg : entitySrcGens) {
940+
941+
if (! esg.getEntity_id().equals(asym.getEntity_id()))
942+
continue;
943+
944+
// found the matching EntitySrcGen
945+
// get the corresponding Entity
946+
Compound c = structure.getCompoundById(eId);
947+
if ( c == null){
948+
if (e!=null && e.getType().equals("polymer")) {
949+
c = createNewCompoundFromESG(esg, eId);
950+
c.setMolName(e.getPdbx_description());
951+
structure.addCompound(c);
952+
logger.debug("Adding Compound with entity id {} from _entity_src_syn, with name: {}",eId,c.getMolName());
953+
}
954+
}
955+
956+
}
957+
958+
for (EntitySrcNat esn : entitySrcNats) {
959+
if (! esn.getEntity_id().equals(asym.getEntity_id()))
960+
continue;
961+
962+
// found the matching EntitySrcNat
963+
// get the corresponding Entity
964+
Compound c = structure.getCompoundById(eId);
965+
if ( c == null){
966+
if (e!=null && e.getType().equals("polymer")) {
967+
c = createNewCompoundFromESN(esn, eId);
968+
c.setMolName(e.getPdbx_description());
969+
structure.addCompound(c);
970+
logger.debug("Adding Compound with entity id {} from _entity_src_syn, with name: {}",eId,c.getMolName());
971+
}
972+
}
973+
974+
}
975+
976+
for (EntitySrcSyn ess : entitySrcSyns) {
977+
if (! ess.getEntity_id().equals(asym.getEntity_id()))
978+
continue;
979+
980+
// found the matching EntitySrcSyn
981+
// get the corresponding Entity
982+
Compound c = structure.getCompoundById(eId);
983+
if ( c == null){
984+
if (e!=null && e.getType().equals("polymer")) {
985+
c = createNewCompoundFromESS(ess, eId);
986+
c.setMolName(e.getPdbx_description());
987+
structure.addCompound(c);
988+
logger.debug("Adding Compound with entity id {} from _entity_src_syn, with name: {}",eId,c.getMolName());
989+
}
990+
}
991+
}
992+
993+
// for some mmCIF files like 1yrm all 3 of _entity_src_gen, _entity_src_nat and _pdbx_entity_src_syn are missing
994+
// we need to fill the Compounds in some other way:
995+
996+
Compound c = structure.getCompoundById(eId);
997+
998+
if (c==null) {
999+
c = new Compound();
1000+
c.setMolId(eId);
1001+
1002+
// we only add the compound if it is a polymeric one (to match what the PDB parser does)
1003+
if (e!=null && e.getType().equals("polymer")) {
1004+
c.setMolName(e.getPdbx_description());
1005+
structure.addCompound(c);
1006+
logger.debug("Adding Compound with entity id {} from _entity, with name: {}",eId, c.getMolName());
1007+
}
1008+
}
1009+
}
9881010

9891011
private Compound createNewCompoundFromESG(EntitySrcGen esg, int eId) {
9901012

biojava-structure/src/main/java/org/biojava/nbio/structure/io/mmcif/SimpleMMcifParser.java

Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323

2424
import org.biojava.nbio.structure.Structure;
2525
import org.biojava.nbio.structure.io.MMCIFFileReader;
26-
import org.biojava.nbio.structure.io.MMCIFFileTools;
2726
import org.biojava.nbio.structure.io.StructureIOFile;
2827
import org.biojava.nbio.structure.io.mmcif.model.*;
2928
import org.biojava.nbio.structure.jama.Matrix;
@@ -43,7 +42,8 @@
4342
import java.util.List;
4443
import java.util.Set;
4544

46-
/** A simple mmCif file parser
45+
/**
46+
* A simple mmCif file parser
4747
*
4848
* @author Andreas Prlic
4949
* @since 1.7
@@ -65,8 +65,15 @@
6565
*/
6666
public class SimpleMMcifParser implements MMcifParser {
6767

68-
private List<MMcifConsumer> consumers ;
6968

69+
70+
/**
71+
* The header appearing at the beginning of a mmCIF file.
72+
* A "block code" of no more than 32 chars can be appended to it.
73+
* See http://www.iucr.org/__data/assets/pdf_file/0019/22618/cifguide.pdf
74+
*/
75+
public static final String MMCIF_TOP_HEADER = "data_";
76+
7077
public static final String LOOP_END = "#";
7178
public static final String LOOP_START = "loop_";
7279
public static final String FIELD_LINE = "_";
@@ -75,6 +82,8 @@ public class SimpleMMcifParser implements MMcifParser {
7582
private static final char s1 = '\'';
7683
private static final char s2 = '\"';
7784

85+
private List<MMcifConsumer> consumers ;
86+
7887
private Struct struct ;
7988

8089
private static final Logger logger = LoggerFactory.getLogger(SimpleMMcifParser.class);
@@ -148,8 +157,8 @@ public void parse(BufferedReader buf)
148157

149158
// the first line is a data_PDBCODE line, test if this looks like a mmcif file
150159
line = buf.readLine();
151-
if (!line.startsWith("data_")){
152-
logger.error("this does not look like a valid MMcif file! The first line should be data_1XYZ, but is " + line);
160+
if (!line.startsWith(MMCIF_TOP_HEADER)){
161+
logger.error("This does not look like a valid mmCIF file! The first line should start with 'data_', but is: '" + line+"'");
153162
triggerDocumentEnd();
154163
return;
155164
}
@@ -168,6 +177,7 @@ public void parse(BufferedReader buf)
168177
category=null;
169178
loopFields.clear();
170179
loopWarnings.clear();
180+
logger.debug("Detected LOOP_END: '{}'. Toggling to inLoop=false", LOOP_END);
171181
continue;
172182

173183

@@ -184,20 +194,24 @@ public void parse(BufferedReader buf)
184194
category = spl[0];
185195
String attribute = spl[1];
186196
loopFields.add(attribute);
197+
logger.debug("Found category: {}, attribute: {}",category, attribute);
187198
if ( spl.length > 2){
188-
logger.warn("found nested attribute, not supported, yet!");
199+
logger.warn("Found nested attribute in {}, not supported yet!",txt);
189200
}
201+
190202
} else {
191203
category = txt;
204+
logger.debug("Found category: {}",category);
192205
}
193-
206+
194207

195208
} else {
196209

197210
// in loop and we found a data line
198211
lineData = processLine(line, buf, loopFields.size());
212+
logger.debug("Found a loop data line with {} data fields", lineData.size());
199213
if ( lineData.size() != loopFields.size()){
200-
logger.warn("did not find enough data fields...");
214+
logger.warn("Expected {} data fields, but found {} in line: {}",loopFields.size(),lineData.size(),line);
201215

202216
}
203217

@@ -216,6 +230,7 @@ public void parse(BufferedReader buf)
216230
inLoop = true;
217231
category=null;
218232
lineData.clear();
233+
logger.debug("Detected LOOP_START: '{}'. Toggling to inLoop=true", LOOP_START);
219234
continue;
220235
} else if (line.startsWith(LOOP_END)){
221236
inLoop = false;
@@ -240,8 +255,8 @@ public void parse(BufferedReader buf)
240255
if ( pos < 0 ) {
241256
// looks like a chem_comp file
242257
// line should start with data, otherwise something is wrong!
243-
if (! line.startsWith(MMCIFFileTools.MMCIF_TOP_HEADER)){
244-
logger.warn("This does not look like a valid MMcif file! The first line should start with 'data_', but is '" + line+"'");
258+
if (! line.startsWith(MMCIF_TOP_HEADER)){
259+
logger.warn("This does not look like a valid mmCIF file! The first line should start with 'data_', but is '" + line+"'");
245260
triggerDocumentEnd();
246261
return;
247262
}
@@ -456,7 +471,7 @@ private List<String> processLine(String line,
456471

457472
private void endLineChecks(String category,List<String> loopFields, List<String> lineData, Set<String> loopWarnings ) throws IOException{
458473

459-
474+
logger.debug("Processing category {}, with fields: {}",category,loopFields.toString());
460475
/*System.out.println("parsed the following data: " +category + " fields: "+
461476
loopFields + " DATA: " +
462477
lineData);
@@ -671,7 +686,7 @@ private void endLineChecks(String category,List<String> loopFields, List<String>
671686

672687
} else {
673688

674-
689+
logger.debug("Using a generic bean for category {}",category);
675690

676691
// trigger a generic bean that can deal with all missing data types...
677692
triggerGeneric(category,loopFields,lineData);

0 commit comments

Comments
 (0)