Skip to content

Commit ecc3133

Browse files
committed
Update CEMC algorithm to support gaps
1 parent 3e11033 commit ecc3133

7 files changed

Lines changed: 452 additions & 568 deletions

File tree

biojava-structure-gui/src/main/java/demo/DemoCEMC.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,13 +46,13 @@ public static void main(String[] args) throws IOException, StructureException, I
4646
//Immunoglobulins (MAMMOTH paper)
4747
//List<String> names = Arrays.asList("2hla.B", "3hla.B", "1cd8", "2rhe", "1tlk", "1ten", "1ttf");
4848
//Globins (MAMMOTH and MUSTA papers)
49-
//List<String> names = Arrays.asList("1mbc", "1hlb", "1thb.A", "1ith.A", "1idr.A", "1dlw", "1kr7.A", "1ew6.A", "1it2.A", "1eco", "3sdh.A", "1cg5.B", "1fhj.B", "1ird.A", "1mba", "2gdm", "1b0b", "1h97.A", "1ash.A", "1jl7.A");
49+
List<String> names = Arrays.asList("1mbc", "1hlb", "1thb.A", "1ith.A", "1idr.A", "1dlw", "1kr7.A", "1ew6.A", "1it2.A", "1eco", "3sdh.A", "1cg5.B", "1fhj.B", "1ird.A", "1mba", "2gdm", "1b0b", "1h97.A", "1ash.A", "1jl7.A");
5050
//Rossman-Fold (POSA paper)
5151
//List<String> names = Arrays.asList("d1heta2", "d1ek6a_", "d1obfo1", "2cmd", "d1np3a2", "d1bgva1", "d1id1a_", "d1id1a_", "d1oi7a1");
5252
//Circular Permutations (Bliven CECP paper) - dynamin GTP-ase with CP G-domain
5353
//List<String> names = Arrays.asList("d1u0la2", "d1jwyb_");
5454
//Circular Permutations: SAND and MFPT domains
55-
List<String> names = Arrays.asList("d2bjqa1", "d1h5pa_", "d1ufna_"); //"d1oqja"
55+
//List<String> names = Arrays.asList("d2bjqa1", "d1h5pa_", "d1ufna_"); //"d1oqja"
5656

5757
//Load the CA atoms of the structures
5858
AtomCache cache = new AtomCache();

biojava-structure/src/main/java/org/biojava/nbio/structure/align/cemc/CeMcMain.java

Lines changed: 101 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package org.biojava.nbio.structure.align.cemc;
22

33
import java.util.ArrayList;
4+
import java.util.Arrays;
45
import java.util.List;
56
import java.util.concurrent.Callable;
67
import java.util.concurrent.ExecutionException;
@@ -12,7 +13,7 @@
1213
import org.biojava.nbio.structure.StructureException;
1314
import org.biojava.nbio.structure.align.CallableStructureAlignment;
1415
import org.biojava.nbio.structure.align.MultipleStructureAligner;
15-
import org.biojava.nbio.structure.align.ce.CeMain;
16+
import org.biojava.nbio.structure.align.ce.CeCPMain;
1617
import org.biojava.nbio.structure.align.ce.ConfigStrucAligParams;
1718
import org.biojava.nbio.structure.align.model.AFPChain;
1819
import org.biojava.nbio.structure.align.multiple.Block;
@@ -28,53 +29,59 @@
2829
import org.biojava.nbio.structure.align.multiple.ReferenceSuperimposer;
2930

3031
/**
31-
* The main class of the Java implementation of the Combinatorial Extension - Monte Carlo Algorithm (CEMC),
32-
* as has been originally developed by C.Guda, E.D.Scheeff, P.E. Bourne and I.N. Shindyalov (2001).
32+
* The main class of the Java implementation of the Combinatorial Extension - Monte Carlo (CEMC) Algorithm,
33+
* as it was originally described by C.Guda, E.D.Scheeff, P.E. Bourne and I.N. Shindyalov (2001).
34+
* <p>
3335
* The original CEMC paper is available from <a href="http://psb.stanford.edu/psb-online/proceedings/psb01/guda.pdf">here</a>.
34-
*
35-
* There is still no demo on how to use this algorithm.
36+
* <p>
37+
* The usage follows the {@link MultipleStructureAligner} interface.
38+
* A Demo on how to use the algorithm can be found in {@link DemoCEMC}.
3639
*
3740
* @author Aleix Lafita
3841
*
3942
*/
40-
public class CeMcMain implements MultipleStructureAligner{
43+
public class CeMcMain implements MultipleStructureAligner {
4144

4245
/**
43-
* version history:
44-
* 1.0 - Initial code implementation from CEMC article.
46+
* Version history:<p>
47+
* 1.0 - Initial code implementation from CEMC article without partial gaps.<p>
48+
* 2.0 - Update to support CP and partial gaps.<p>
4549
*/
46-
public static final String version = "1.0";
50+
public static final String version = "2.0";
4751
public static final String algorithmName = "jCEMC";
52+
4853
private CeMcParameters params;
4954
private MultipleAlignmentEnsemble ensemble;
55+
int reference = 0;
5056

5157
/**
52-
* Default constructor. Instantiates an empty CeMcMain object with the default parameters.
58+
* Default constructor.
59+
* Default parameters are used.
5360
*/
5461
public CeMcMain(){
5562
ensemble = null;
5663
params = new CeMcParameters();
5764
}
5865

5966
/**
60-
* Creates the seed of the multiple structure alignment, before optimization, from all-to-all pairwise alignments
61-
* of the structures. The alignments are generated in parallel using the Java API for concurrency management.
62-
* The closest structure to all others is chosen as the reference and all the alignments to it are taken to generate
63-
* an ungapped seed MultipleAlignment.
64-
*
65-
* This method is static because can be used outside this alignment class.
67+
* Creates a MultipleAlignment seed for MC optimization from the representative Atoms
68+
* of the structures. If there are N structures:
69+
* <ul><li>Generate (N*(N-1))/2 all-to-all alignments in parallel using the Java API.
70+
* <li>Choose the closest structure to all others as the reference.
71+
* <li>Generate a MultipleAlignment by combining all the alignments to the reference.
72+
* </ul>
6673
*
6774
* @param atomArrays List of Atoms to align of the structures
6875
* @return MultipleAlignment seed alignment
6976
* @throws ExecutionException
7077
* @throws InterruptedException
7178
* @throws StructureException
7279
*/
73-
public MultipleAlignment generateSeed(List<Atom[]> atomArrays) throws InterruptedException, ExecutionException, StructureException{
80+
private MultipleAlignment generateSeed(List<Atom[]> atomArrays) throws InterruptedException, ExecutionException, StructureException{
7481

7582
int size = atomArrays.size();
7683

77-
//List to store the all-to-all alignments. Contains the n^2 pairwise alignments as a 2D matrix indices.
84+
//List to store the all-to-all alignments. Contains the n^2 pairwise alignments as a 2D List
7885
List<List<AFPChain>> afpAlignments = new ArrayList<List<AFPChain>>();
7986
for (int i=0; i<size; i++){
8087
afpAlignments.add(new ArrayList<AFPChain>());
@@ -86,16 +93,17 @@ public MultipleAlignment generateSeed(List<Atom[]> atomArrays) throws Interrupte
8693
ExecutorService executor = Executors.newCachedThreadPool();
8794
List<Future<AFPChain>> afpFuture = new ArrayList<Future<AFPChain>>();
8895

89-
//Create all the possible protein pairwise combinations and call the alignment
96+
//Create all the possible protein pairwise combinations (N*(N-1)/2)and call the CeCP pairwise alignment algorithm
9097
for (int i=0; i<size; i++){
9198
for (int j=i+1; j<size; j++){
9299

93-
Callable<AFPChain> worker = new CallableStructureAlignment(atomArrays.get(i), atomArrays.get(j), CeMain.algorithmName);
100+
Callable<AFPChain> worker = new CallableStructureAlignment(atomArrays.get(i), atomArrays.get(j), CeCPMain.algorithmName);
94101
Future<AFPChain> submit = executor.submit(worker);
95102
afpFuture.add(submit);
96103
}
97104
}
98105

106+
//Store the resulting AFPChains in the 2D List
99107
int index = 0; //the alignment index
100108
for (int i=0; i<size; i++){
101109
for (int j=i; j<size; j++){
@@ -108,107 +116,124 @@ public MultipleAlignment generateSeed(List<Atom[]> atomArrays) throws Interrupte
108116
}
109117
executor.shutdown(); //Finish the executor because all the pairwise alignments have been calculated.
110118

111-
//Choose the reference structure from the lowest average RMSD
119+
reference = chooseReferenceRMSD(afpAlignments);
120+
return combineReferenceAlignments(afpAlignments.get(reference), atomArrays, reference);
121+
}
122+
123+
/**
124+
* This method takes the all-to-all pairwise alignments Matrix (as a double List of AFPChain) and
125+
* calculates the structure with the lowest average RMSD against all others. The index of this structure
126+
* is returned.
127+
*
128+
* @param alignments List double containing all-to-all pairwise alignments
129+
* @return int reference index
130+
*/
131+
private static int chooseReferenceRMSD(List<List<AFPChain>> afpAlignments){
132+
133+
int size = afpAlignments.size();
134+
112135
List<Double> RMSDs = new ArrayList<Double>();
113-
for (int i=0; i<size; i++){
136+
for (int i=0; i<afpAlignments.size(); i++){
114137
double rmsd=0.0;
115138
for (int j=0; j<size; j++){
116139
if (i!=j) rmsd += afpAlignments.get(i).get(j).getTotalRmsdOpt();
117140
}
118141
RMSDs.add(rmsd);
119142
}
120-
int ref = 0;
143+
int reference = 0;
121144
for (int i=1; i<size; i++){
122-
if (RMSDs.get(i) < RMSDs.get(ref)) ref = i;
145+
if (RMSDs.get(i) < RMSDs.get(reference)) reference = i;
123146
}
124-
125-
return seedFromReference(afpAlignments.get(ref), atomArrays, ref);
147+
return reference;
126148
}
127149

128150
/**
129-
* This method takes a list of pairwise alignments to the reference structure and calculates the
130-
* MultipleAlignment resulting from them. It ignores blocks in AFPChain (flexible parts) and builds
131-
* the Blocks as the definition of MultipleAlignment dictates {@link Block}. Gaps are not included.
151+
* This method takes a list of pairwise alignments to the reference structure and calculates a
152+
* MultipleAlignment resulting from combining their residue equivalencies.
153+
* <p>
154+
* It uses the blocks in AFPChain as {@link Block}s in the MultipleAlignment, so only CP are considered,
155+
* and flexible parts are ignored.
132156
*
133157
* @param afpList the list of pairwise alignments to the reference
134158
* @param atomArrays List of Atoms of the structures
135159
* @param ref index of the reference structure
136160
* @return MultipleAlignment seed alignment
137161
* @throws StructureException
138162
*/
139-
private MultipleAlignment seedFromReference(List<AFPChain> afpList, List<Atom[]> atomArrays, int ref) throws StructureException {
163+
private static MultipleAlignment combineReferenceAlignments(List<AFPChain> afpList, List<Atom[]> atomArrays, int ref) throws StructureException {
140164

141165
int size = atomArrays.size(); //the number of structures
142166
int length = 0; //the number of residues of the reference structure
143-
if (ref==0) length = afpList.get(1).getCa1Length();
167+
if (ref==0) length = afpList.get(1).getCa1Length(); //because the 0-0 alignment is null
144168
else length = afpList.get(0).getCa2Length();
145169

146-
//Stores the alignment equivalencies of all the structures as a double List (first index reference residue, second structure nr) - note this is the inverse of AlignRes in Block
170+
//Stores the alignment equivalencies of all the structures as a double List: equivalencies[str][res]
147171
List<List<Integer>> equivalencies = new ArrayList<List<Integer>>();
148-
for (int i=0; i<length; i++){
172+
for (int str=0; str<size; str++){
149173
equivalencies.add(new ArrayList<Integer>());
150-
for (int j=0; j<size; j++){
151-
if (j==ref) equivalencies.get(i).add(i);
152-
else equivalencies.get(i).add(null);
174+
for (int res=0; res<length; res++){
175+
if (str==ref) equivalencies.get(str).add(res); //identity
176+
else equivalencies.get(str).add(null);
153177
}
154178
}
155179

156180
//Now we parse the AFPChains adding the residue equivalencies
157-
for (int j=0; j<size; j++){
158-
if (j==ref) continue; //avoid the self-comparison because it is null
159-
for (int bk=0; bk<afpList.get(j).getBlockNum(); bk++){
160-
for (int i=0; i<afpList.get(j).getOptLen()[bk]; i++){
161-
int res1 = 0;
181+
for (int str=0; str<size; str++){
182+
if (str==ref) continue; //avoid the self-comparison because it is null
183+
for (int bk=0; bk<afpList.get(str).getBlockNum(); bk++){
184+
for (int i=0; i<afpList.get(str).getOptLen()[bk]; i++){
185+
int res1 = 0; //reference index
162186
int res2 = 0;
163-
//The low index is always in the first chain and the higher in the second
164-
if(j>ref){
165-
res1 = afpList.get(j).getOptAln()[bk][0][i];
166-
res2 = afpList.get(j).getOptAln()[bk][1][i];
187+
//The low index is always in the first chain (0) and the higher in the second (1)
188+
if(str>ref){
189+
res1 = afpList.get(str).getOptAln()[bk][0][i];
190+
res2 = afpList.get(str).getOptAln()[bk][1][i];
167191
}
168-
else if (j<ref){
169-
res1 = afpList.get(j).getOptAln()[bk][1][i];
170-
res2 = afpList.get(j).getOptAln()[bk][0][i];
192+
else if (str<ref){
193+
res1 = afpList.get(str).getOptAln()[bk][1][i];
194+
res2 = afpList.get(str).getOptAln()[bk][0][i];
171195
}
172-
equivalencies.get(res1).set(j,res2);
196+
equivalencies.get(str).set(res1,res2);
173197
}
174198
}
175199
}
176200

177-
//Now that we have the equivalencies we create the MultipleAlignment
178-
MultipleAlignment seed = new MultipleAlignmentImpl(ensemble);
201+
//Now that we have translated the equivalencies we create the MultipleAlignment
202+
MultipleAlignment seed = new MultipleAlignmentImpl();
203+
seed.getEnsemble().setAtomArrays(atomArrays);
179204
BlockSet blockSet = new BlockSetImpl(seed);
180-
new BlockImpl(blockSet); //This automatically adds a Block to the Block list in BlockSet
205+
new BlockImpl(blockSet);
181206

182-
//We loop through all the equivalencies checking for gapped columns and CP
183-
//Case CP: we start a new block
184-
//Case gap: we do not include the equivalencies, the seed cannot contain gaps
185-
for (int i=0; i<length; i++){
186-
boolean gap = false;
207+
//Store last positions in the block different than null to detect CP
208+
int[] lastResidues = new int[size];
209+
Arrays.fill(lastResidues, -1);
210+
211+
//We loop through all the equivalencies checking for CP: start new Block if CP
212+
for (int pos=0; pos<length; pos++){
187213
boolean cp = false;
188-
for (int j=0; j<size; j++){
189-
if (equivalencies.get(i).get(j) == null){ //there is a gap, do not consider the
190-
gap = true;
214+
for (int str=0; str<size; str++){
215+
if (equivalencies.get(str).get(pos) == null) continue; //there is a gap, ignore position
216+
else if (equivalencies.get(str).get(pos) < lastResidues[str]){
217+
cp = true; //Because the current residue is lower than the last added for at least one structure
191218
break;
192-
}
193-
if (blockSet.getBlocks().get(blockSet.getBlocks().size()-1).length() > 0){
194-
if (equivalencies.get(i).get(j) < blockSet.getBlocks().get(blockSet.getBlocks().size()-1)
195-
.getAlignRes().get(j).get( blockSet.getBlocks().get(blockSet.getBlocks().size()-1).length()-1)){
196-
cp = true; //Because the current residue is lower than the last added for at least one structure
197-
}
198-
}
219+
} else lastResidues[str] = equivalencies.get(str).get(pos);
199220
}
200-
if (gap) continue;
201-
if (cp) //if there is a CP create a new Block
221+
if (cp){ //if there is a CP create a new Block, initialize AlignRes and clear lastResidues
202222
new BlockImpl(blockSet);
223+
Arrays.fill(lastResidues,-1);
224+
}
203225

204226
//Now add the equivalent residues into the Block AlignRes variable
205-
for (int j=0; j<size; j++){
206-
if (blockSet.getBlocks().get(blockSet.getBlocks().size()-1).getAlignRes().size() == 0) //If it is empty initialize a list for every structure
207-
for (int k=0; k<size; k++) blockSet.getBlocks().get(blockSet.getBlocks().size()-1).getAlignRes().add(new ArrayList<Integer>());
208-
blockSet.getBlocks().get(blockSet.getBlocks().size()-1).getAlignRes().get(j).add(equivalencies.get(i).get(j));
227+
for (int str=0; str<size; str++){
228+
if (blockSet.getBlocks().get(blockSet.getBlocks().size()-1).getAlignRes() == null){ //If it is empty initialize it
229+
List<List<Integer>> alnRes = new ArrayList<List<Integer>>(size);
230+
for (int k=0; k<size; k++) alnRes.add(new ArrayList<Integer>());
231+
blockSet.getBlocks().get(blockSet.getBlocks().size()-1).setAlignRes(alnRes);
232+
}
233+
blockSet.getBlocks().get(blockSet.getBlocks().size()-1).getAlignRes().get(str).add(equivalencies.get(str).get(pos));
209234
}
210235
}
211-
MultipleSuperimposer imposer= new ReferenceSuperimposer();
236+
MultipleSuperimposer imposer= new ReferenceSuperimposer(ref);
212237
imposer.superimpose(seed);
213238
return seed;
214239
}
@@ -228,8 +253,8 @@ public MultipleAlignment align(List<Atom[]> atomArrays, Object params) throws St
228253
int seed = 0;
229254

230255
//Repeat the optimization 10 times in parallel, to obtain a more robust result.
231-
for (int i=0; i<10; i++){
232-
Callable<MultipleAlignment> worker = new CeMcOptimizer(result, seed+i);
256+
for (int i=0; i<1; i++){
257+
Callable<MultipleAlignment> worker = new CeMcOptimizer(result, seed+i,reference);
233258
Future<MultipleAlignment> submit = executor.submit(worker);
234259
afpFuture.add(submit);
235260
}

0 commit comments

Comments
 (0)