Skip to content

Commit 5840f24

Browse files
authored
Merge pull request #857 from josemduarte/optimise-sym-mates-clusterer
Optimization of subunit clusterer for quaternary sym detection of PDB deposited structures
2 parents ae2c3bd + 3b967b0 commit 5840f24

4 files changed

Lines changed: 128 additions & 87 deletions

File tree

biojava-structure/src/main/java/org/biojava/nbio/structure/cluster/SubunitCluster.java

Lines changed: 95 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
3131
import org.biojava.nbio.core.sequence.ProteinSequence;
3232
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
3333
import org.biojava.nbio.structure.Atom;
34+
import org.biojava.nbio.structure.Chain;
35+
import org.biojava.nbio.structure.Structure;
3436
import org.biojava.nbio.structure.StructureException;
3537
import org.biojava.nbio.structure.align.StructureAlignment;
3638
import org.biojava.nbio.structure.align.StructureAlignmentFactory;
@@ -45,6 +47,7 @@
4547
import org.biojava.nbio.structure.align.multiple.MultipleAlignmentImpl;
4648
import org.biojava.nbio.structure.align.multiple.util.MultipleAlignmentScorer;
4749
import org.biojava.nbio.structure.align.multiple.util.ReferenceSuperimposer;
50+
import org.biojava.nbio.structure.quaternary.BiologicalAssemblyBuilder;
4851
import org.biojava.nbio.structure.symmetry.core.QuatSymmetrySubunits;
4952
import org.biojava.nbio.structure.symmetry.internal.CESymmParameters;
5053
import org.biojava.nbio.structure.symmetry.internal.CeSymm;
@@ -71,11 +74,10 @@
7174
*/
7275
public class SubunitCluster {
7376

74-
private static final Logger logger = LoggerFactory
75-
.getLogger(SubunitCluster.class);
77+
private static final Logger logger = LoggerFactory.getLogger(SubunitCluster.class);
7678

77-
private List<Subunit> subunits = new ArrayList<Subunit>();
78-
private List<List<Integer>> subunitEQR = new ArrayList<List<Integer>>();
79+
private List<Subunit> subunits = new ArrayList<>();
80+
private List<List<Integer>> subunitEQR = new ArrayList<>();
7981
private int representative = -1;
8082

8183
private SubunitClustererMethod method = SubunitClustererMethod.SEQUENCE;
@@ -119,7 +121,7 @@ public SubunitCluster(Subunit subunit) {
119121

120122
subunits.add(subunit);
121123

122-
List<Integer> identity = new ArrayList<Integer>();
124+
List<Integer> identity = new ArrayList<>();
123125
for (int i = 0; i < subunit.size(); i++)
124126
identity.add(i);
125127
subunitEQR.add(identity);
@@ -179,6 +181,38 @@ public boolean isIdenticalTo(SubunitCluster other) {
179181
return thisSequence.equals(otherSequence);
180182
}
181183

184+
/**
185+
* Tells whether the other SubunitCluster contains exactly the same Subunit.
186+
* This is checked by equality of their entity identifiers if they are present.
187+
*
188+
* @param other
189+
* SubunitCluster
190+
* @return true if the SubunitClusters are identical, false otherwise
191+
*/
192+
public boolean isIdenticalByEntityIdTo(SubunitCluster other) {
193+
Structure thisStruct = this.subunits.get(this.representative).getStructure();
194+
Structure otherStruct = other.subunits.get(other.representative).getStructure();
195+
String thisName = this.subunits.get(this.representative).getName();
196+
String otherName = other.subunits.get(this.representative).getName();
197+
Chain thisChain = thisStruct.getChain(thisName);
198+
Chain otherChain = otherStruct.getChain(otherName);
199+
if (thisChain == null || otherChain == null) {
200+
logger.info("Can't determine entity ids of SubunitClusters {}-{}. Ignoring identity check by entity id",
201+
this.subunits.get(this.representative).getName(),
202+
other.subunits.get(other.representative).getName());
203+
return false;
204+
}
205+
if (thisChain.getEntityInfo() == null || otherChain.getEntityInfo() == null) {
206+
logger.info("Can't determine entity ids of SubunitClusters {}-{}. Ignoring identity check by entity id",
207+
this.subunits.get(this.representative).getName(),
208+
other.subunits.get(other.representative).getName());
209+
return false;
210+
}
211+
int thisEntityId = thisChain.getEntityInfo().getMolId();
212+
int otherEntityId = otherChain.getEntityInfo().getMolId();
213+
return thisEntityId == otherEntityId;
214+
}
215+
182216
/**
183217
* Merges the other SubunitCluster into this one if it contains exactly the
184218
* same Subunit. This is checked by {@link #isIdenticalTo(SubunitCluster)}.
@@ -192,7 +226,35 @@ public boolean mergeIdentical(SubunitCluster other) {
192226
if (!isIdenticalTo(other))
193227
return false;
194228

195-
logger.info("SubunitClusters are identical");
229+
logger.info("SubunitClusters {}-{} are identical in sequence",
230+
this.subunits.get(this.representative).getName(),
231+
other.subunits.get(other.representative).getName());
232+
233+
this.subunits.addAll(other.subunits);
234+
this.subunitEQR.addAll(other.subunitEQR);
235+
236+
return true;
237+
}
238+
239+
/**
240+
* Merges the other SubunitCluster into this one if it contains exactly the
241+
* same Subunit. This is checked by comparing the entity identifiers of the subunits
242+
* if one can be found.
243+
* Thus this only makes sense when the subunits are complete chains of a
244+
* deposited PDB entry. I
245+
*
246+
* @param other
247+
* SubunitCluster
248+
* @return true if the SubunitClusters were merged, false otherwise
249+
*/
250+
public boolean mergeIdenticalByEntityId(SubunitCluster other) {
251+
252+
if (!isIdenticalByEntityIdTo(other))
253+
return false;
254+
255+
logger.info("SubunitClusters {}-{} belong to same entity. Assuming they are identical",
256+
this.subunits.get(this.representative).getName(),
257+
other.subunits.get(other.representative).getName());
196258

197259
this.subunits.addAll(other.subunits);
198260
this.subunitEQR.addAll(other.subunitEQR);
@@ -296,13 +358,15 @@ public boolean mergeSequence(SubunitCluster other, SubunitClustererParameters pa
296358
return false;
297359
}
298360

299-
logger.info(String.format("SubunitClusters are similar in sequence "
300-
+ "with %.2f sequence identity and %.2f coverage", sequenceIdentity,
301-
sequenceCoverage));
361+
logger.info(String.format("SubunitClusters %s-%s are similar in sequence "
362+
+ "with %.2f sequence identity and %.2f coverage",
363+
this.subunits.get(this.representative).getName(),
364+
other.subunits.get(other.representative).getName(),
365+
sequenceIdentity, sequenceCoverage));
302366

303367
// If coverage and sequence identity sufficient, merge other and this
304-
List<Integer> thisAligned = new ArrayList<Integer>();
305-
List<Integer> otherAligned = new ArrayList<Integer>();
368+
List<Integer> thisAligned = new ArrayList<>();
369+
List<Integer> otherAligned = new ArrayList<>();
306370

307371
// Extract the aligned residues of both Subunit
308372
for (int p = 1; p < aligner.getPair().getLength() + 1; p++) {
@@ -318,60 +382,15 @@ public boolean mergeSequence(SubunitCluster other, SubunitClustererParameters pa
318382

319383
// Only consider residues that are part of the SubunitCluster
320384
if (this.subunitEQR.get(this.representative).contains(thisIndex)
321-
&& other.subunitEQR.get(other.representative).contains(
322-
otherIndex)) {
385+
&& other.subunitEQR.get(other.representative).contains(otherIndex)) {
323386
thisAligned.add(thisIndex);
324387
otherAligned.add(otherIndex);
325388
}
326389
}
327390

328-
// Do a List intersection to find out which EQR columns to remove
329-
List<Integer> thisRemove = new ArrayList<Integer>();
330-
List<Integer> otherRemove = new ArrayList<Integer>();
331-
332-
for (int t = 0; t < this.subunitEQR.get(this.representative).size(); t++) {
333-
// If the index is aligned do nothing, otherwise mark as removing
334-
if (!thisAligned.contains(this.subunitEQR.get(this.representative)
335-
.get(t)))
336-
thisRemove.add(t);
337-
}
338-
339-
for (int t = 0; t < other.subunitEQR.get(other.representative).size(); t++) {
340-
// If the index is aligned do nothing, otherwise mark as removing
341-
if (!otherAligned.contains(other.subunitEQR.get(
342-
other.representative).get(t)))
343-
otherRemove.add(t);
344-
}
345-
// Now remove unaligned columns, from end to start
346-
Collections.sort(thisRemove);
347-
Collections.reverse(thisRemove);
348-
Collections.sort(otherRemove);
349-
Collections.reverse(otherRemove);
350-
351-
for (int t = 0; t < thisRemove.size(); t++) {
352-
for (List<Integer> eqr : this.subunitEQR) {
353-
int column = thisRemove.get(t);
354-
eqr.remove(column);
355-
}
356-
}
357-
358-
for (int t = 0; t < otherRemove.size(); t++) {
359-
for (List<Integer> eqr : other.subunitEQR) {
360-
int column = otherRemove.get(t);
361-
eqr.remove(column);
362-
}
363-
}
364-
365-
// The representative is the longest sequence
366-
if (this.subunits.get(this.representative).size() < other.subunits.get(
367-
other.representative).size())
368-
this.representative = other.representative + subunits.size();
369-
370-
this.subunits.addAll(other.subunits);
371-
this.subunitEQR.addAll(other.subunitEQR);
391+
updateEquivResidues(other, thisAligned, otherAligned);
372392

373393
this.method = SubunitClustererMethod.SEQUENCE;
374-
375394
pseudoStoichiometric = !params.isHighConfidenceScores(sequenceIdentity,sequenceCoverage);
376395

377396
return true;
@@ -445,8 +464,8 @@ public boolean mergeStructure(SubunitCluster other, SubunitClustererParameters p
445464

446465
// Merge clusters
447466
List<List<Integer>> alignedRes = msa.getBlock(0).getAlignRes();
448-
List<Integer> thisAligned = new ArrayList<Integer>();
449-
List<Integer> otherAligned = new ArrayList<Integer>();
467+
List<Integer> thisAligned = new ArrayList<>();
468+
List<Integer> otherAligned = new ArrayList<>();
450469

451470
// Extract the aligned residues of both Subunit
452471
for (int p = 0; p < msa.length(); p++) {
@@ -469,24 +488,30 @@ public boolean mergeStructure(SubunitCluster other, SubunitClustererParameters p
469488
}
470489
}
471490

491+
updateEquivResidues(other, thisAligned, otherAligned);
492+
493+
this.method = SubunitClustererMethod.STRUCTURE;
494+
pseudoStoichiometric = true;
495+
496+
return true;
497+
}
498+
499+
private void updateEquivResidues(SubunitCluster other, List<Integer> thisAligned, List<Integer> otherAligned) {
472500
// Do a List intersection to find out which EQR columns to remove
473-
List<Integer> thisRemove = new ArrayList<Integer>();
474-
List<Integer> otherRemove = new ArrayList<Integer>();
501+
List<Integer> thisRemove = new ArrayList<>();
502+
List<Integer> otherRemove = new ArrayList<>();
475503

476504
for (int t = 0; t < this.subunitEQR.get(this.representative).size(); t++) {
477505
// If the index is aligned do nothing, otherwise mark as removing
478-
if (!thisAligned.contains(this.subunitEQR.get(this.representative)
479-
.get(t)))
506+
if (!thisAligned.contains(this.subunitEQR.get(this.representative).get(t)))
480507
thisRemove.add(t);
481508
}
482509

483510
for (int t = 0; t < other.subunitEQR.get(other.representative).size(); t++) {
484511
// If the index is aligned do nothing, otherwise mark as removing
485-
if (!otherAligned.contains(other.subunitEQR.get(
486-
other.representative).get(t)))
512+
if (!otherAligned.contains(other.subunitEQR.get(other.representative).get(t)))
487513
otherRemove.add(t);
488514
}
489-
490515
// Now remove unaligned columns, from end to start
491516
Collections.sort(thisRemove);
492517
Collections.reverse(thisRemove);
@@ -508,17 +533,12 @@ public boolean mergeStructure(SubunitCluster other, SubunitClustererParameters p
508533
}
509534

510535
// The representative is the longest sequence
511-
if (this.subunits.get(this.representative).size() < other.subunits.get(
512-
other.representative).size())
536+
if (this.subunits.get(this.representative).size() < other.subunits.get(other.representative).size())
513537
this.representative = other.representative + subunits.size();
514538

515539
this.subunits.addAll(other.subunits);
516540
this.subunitEQR.addAll(other.subunitEQR);
517541

518-
this.method = SubunitClustererMethod.STRUCTURE;
519-
pseudoStoichiometric = true;
520-
521-
return true;
522542
}
523543

524544
/**
@@ -568,9 +588,9 @@ public boolean divideInternally(SubunitClustererParameters clusterParams)
568588
List<List<Integer>> alignedRes = result.getMultipleAlignment()
569589
.getBlock(0).getAlignRes();
570590

571-
List<List<Integer>> columns = new ArrayList<List<Integer>>();
591+
List<List<Integer>> columns = new ArrayList<>();
572592
for (int s = 0; s < alignedRes.size(); s++)
573-
columns.add(new ArrayList<Integer>(alignedRes.get(s).size()));
593+
columns.add(new ArrayList<>(alignedRes.get(s).size()));
574594

575595
// Extract the aligned columns of each repeat in the Subunit
576596
for (int col = 0; col < alignedRes.get(0).size(); col++) {

biojava-structure/src/main/java/org/biojava/nbio/structure/cluster/SubunitClusterer.java

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -56,12 +56,8 @@ public static Stoichiometry cluster(Structure structure,
5656
return cluster(subunits, params);
5757
}
5858

59-
public static Stoichiometry cluster(List<Subunit> subunits,
60-
SubunitClustererParameters params) {
61-
62-
// The collection of clusters to return
63-
List<SubunitCluster> clusters = new ArrayList<SubunitCluster>();
64-
59+
public static Stoichiometry cluster(List<Subunit> subunits, SubunitClustererParameters params) {
60+
List<SubunitCluster> clusters = new ArrayList<>();
6561
if (subunits.size() == 0)
6662
return new Stoichiometry(clusters);
6763

@@ -75,7 +71,14 @@ public static Stoichiometry cluster(List<Subunit> subunits,
7571
for (int c1 = 0; c1 < clusters.size(); c1++) {
7672
for (int c2 = clusters.size() - 1; c2 > c1; c2--) {
7773
try {
78-
if (clusters.get(c1).mergeSequence(clusters.get(c2), params)) {
74+
if (params.isUseEntityIdForSeqIdentityDetermination() &&
75+
clusters.get(c1).mergeIdenticalByEntityId(clusters.get(c2))) {
76+
// This we will only do if the switch is for entity id comparison is on.
77+
// In some cases it can save enormous amounts of time, e.g. for clustering full
78+
// chains of deposited PDB entries. For instance for 6NHJ: with pure alignments it
79+
// takes ~ 6 hours, with entity id comparisons it takes 2 minutes.
80+
clusters.remove(c2);
81+
} else if (clusters.get(c1).mergeSequence(clusters.get(c2), params)) {
7982
clusters.remove(c2);
8083
}
8184

biojava-structure/src/main/java/org/biojava/nbio/structure/cluster/SubunitClustererParameters.java

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ public class SubunitClustererParameters implements Serializable {
4545
private double sequenceIdentityThreshold;
4646
private double sequenceCoverageThreshold = 0.75;
4747

48+
private boolean useEntityIdForSeqIdentityDetermination = false;
49+
4850
private double rmsdThreshold = 3.0;
4951
private double structureCoverageThreshold = 0.75;
5052
private double tmThreshold = 0.5;
@@ -506,5 +508,21 @@ public boolean isHighConfidenceScores(double sequenceIdentity, double sequenceCo
506508
return sequenceIdentity>=hcSequenceIdentityLocal && sequenceCoverage >= hcSequenceCoverageLocal;
507509
}
508510

511+
/**
512+
* Whether to use the entity id of subunits to infer that sequences are identical.
513+
* Only applies if the {@link SubunitClustererMethod} is a sequence based one.
514+
* @return
515+
*/
516+
public boolean isUseEntityIdForSeqIdentityDetermination() {
517+
return useEntityIdForSeqIdentityDetermination;
518+
}
509519

520+
/**
521+
* Whether to use the entity id of subunits to infer that sequences are identical.
522+
* Only applies if the {@link SubunitClustererMethod} is a sequence based one.
523+
* @param useEntityIdForSeqIdentityDetermination the flag to be set
524+
*/
525+
public void setUseEntityIdForSeqIdentityDetermination(boolean useEntityIdForSeqIdentityDetermination) {
526+
this.useEntityIdForSeqIdentityDetermination = useEntityIdForSeqIdentityDetermination;
527+
}
510528
}

biojava-structure/src/test/java/org/biojava/nbio/structure/cluster/TestSubunitCluster.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,8 @@ public class TestSubunitCluster {
5252
@Test
5353
public void testMergeIdentical() {
5454

55-
// Create an Atom Array of ploy-alanine
56-
List<Atom> atoms = new ArrayList<Atom>(10);
55+
// Create an Atom Array of poly-alanine
56+
List<Atom> atoms = new ArrayList<>(10);
5757
for (int i = 0; i < 10; i++) {
5858
Group g = new AminoAcidImpl();
5959
g.setPDBName("ALA");
@@ -79,7 +79,7 @@ public void testMergeIdentical() {
7979
assertEquals(sc1.length(), 10);
8080

8181
// Create an Atom Array of poly-glycine
82-
List<Atom> atoms2 = new ArrayList<Atom>(10);
82+
List<Atom> atoms2 = new ArrayList<>(10);
8383
for (int i = 0; i < 10; i++) {
8484
Group g = new AminoAcidImpl();
8585
g.setPDBName("GLY");
@@ -112,7 +112,7 @@ public void testMergeIdentical() {
112112
public void testMergeSequence() throws CompoundNotFoundException {
113113

114114
// Create an Atom Array of ploy-alanine
115-
List<Atom> atoms = new ArrayList<Atom>(100);
115+
List<Atom> atoms = new ArrayList<>(100);
116116
for (int i = 0; i < 100; i++) {
117117
Group g = new AminoAcidImpl();
118118
g.setPDBName("ALA");
@@ -163,7 +163,7 @@ public void testMergeSequence() throws CompoundNotFoundException {
163163
assertEquals(sc1.length(), 100);
164164

165165
// Create an Atom Array of 9 glycine and 91 alanine
166-
List<Atom> atoms3 = new ArrayList<Atom>(100);
166+
List<Atom> atoms3 = new ArrayList<>(100);
167167
for (int i = 0; i < 9; i++) {
168168
Group g = new AminoAcidImpl();
169169
g.setPDBName("GLY");

0 commit comments

Comments
 (0)