Skip to content

Commit 58efc46

Browse files
committed
Merge remote-tracking branch 'upstream/master' into bcif-integration
# Conflicts: # biojava-structure/src/test/java/org/biojava/nbio/structure/asa/TestAsaCalc.java
2 parents 19766ca + 103d7d4 commit 58efc46

File tree

3 files changed

+139
-118
lines changed

3 files changed

+139
-118
lines changed

biojava-structure/src/main/java/org/biojava/nbio/structure/asa/AsaCalculator.java

Lines changed: 97 additions & 104 deletions
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,12 @@
2727
import org.slf4j.LoggerFactory;
2828

2929
import javax.vecmath.Point3d;
30+
import javax.vecmath.Vector3d;
3031
import java.util.*;
3132
import java.util.concurrent.ExecutorService;
3233
import java.util.concurrent.Executors;
3334

3435

35-
36-
3736
/**
3837
* Class to calculate Accessible Surface Areas based on
3938
* the rolling ball algorithm by Shrake and Rupley.
@@ -42,14 +41,16 @@
4241
* (now source is available at https://github.com/boscoh/asa).
4342
* Thanks to Bosco K. Ho for a great piece of code and for his fantastic blog.
4443
* <p>
44+
* A few optimizations come from Eisenhaber et al, J Comp Chemistry 1994
45+
* (https://onlinelibrary.wiley.com/doi/epdf/10.1002/jcc.540160303)
46+
* <p>
4547
* See
4648
* Shrake, A., and J. A. Rupley. "Environment and Exposure to Solvent of Protein Atoms.
4749
* Lysozyme and Insulin." JMB (1973) 79:351-371.
4850
* Lee, B., and Richards, F.M. "The interpretation of Protein Structures: Estimation of
4951
* Static Accessibility" JMB (1971) 55:379-400
5052
*
5153
* @author Jose Duarte
52-
*
5354
*/
5455
public class AsaCalculator {
5556

@@ -86,10 +87,10 @@ public class AsaCalculator {
8687

8788
private class AsaCalcWorker implements Runnable {
8889

89-
private int i;
90-
private double[] asas;
90+
private final int i;
91+
private final double[] asas;
9192

92-
public AsaCalcWorker(int i, double[] asas) {
93+
private AsaCalcWorker(int i, double[] asas) {
9394
this.i = i;
9495
this.asas = asas;
9596
}
@@ -100,15 +101,24 @@ public void run() {
100101
}
101102
}
102103

104+
static class IndexAndDistance {
105+
final int index;
106+
final double dist;
107+
IndexAndDistance(int index, double dist) {
108+
this.index = index;
109+
this.dist = dist;
110+
}
111+
}
112+
103113

104-
private Point3d[] atomCoords;
105-
private Atom[] atoms;
106-
private double[] radii;
107-
private double probe;
108-
private int nThreads;
109-
private Point3d[] spherePoints;
114+
private final Point3d[] atomCoords;
115+
private final Atom[] atoms;
116+
private final double[] radii;
117+
private final double probe;
118+
private final int nThreads;
119+
private Vector3d[] spherePoints;
110120
private double cons;
111-
private int[][] neighborIndices;
121+
private IndexAndDistance[][] neighborIndices;
112122

113123
private boolean useSpatialHashingForNeighbors;
114124

@@ -123,7 +133,6 @@ public void run() {
123133
* @param nThreads the number of parallel threads to use for the calculation
124134
* @param hetAtoms if true HET residues are considered, if false they aren't, equivalent to
125135
* NACCESS' -h option
126-
* @see StructureTools#getAllNonHAtomArray
127136
*/
128137
public AsaCalculator(Structure structure, double probe, int nSpherePoints, int nThreads, boolean hetAtoms) {
129138
this.atoms = StructureTools.getAllNonHAtomArray(structure, hetAtoms);
@@ -243,7 +252,7 @@ public GroupAsa[] getGroupAsas() {
243252
}
244253
}
245254

246-
return asas.values().toArray(new GroupAsa[asas.size()]);
255+
return asas.values().toArray(new GroupAsa[0]);
247256
}
248257

249258
/**
@@ -276,46 +285,9 @@ public double[] calculateAsas() {
276285

277286
} else {
278287
logger.debug("Will use {} threads for ASA calculation", nThreads);
279-
// NOTE the multithreaded calculation does not scale up well in some systems,
280-
// why? I guess some memory/garbage collect problem? I tried increasing Xmx in pc8201 but didn't help
281-
282-
// Following scaling tests are for 3hbx, calculating ASA of full asym unit (6 chains):
283-
284-
// SCALING test done in merlinl01 (12 cores, Xeon X5670 @ 2.93GHz, 24GB RAM)
285-
//1 threads, time: 8.8s -- x1.0
286-
//2 threads, time: 4.4s -- x2.0
287-
//3 threads, time: 2.9s -- x3.0
288-
//4 threads, time: 2.2s -- x3.9
289-
//5 threads, time: 1.8s -- x4.9
290-
//6 threads, time: 1.6s -- x5.5
291-
//7 threads, time: 1.4s -- x6.5
292-
//8 threads, time: 1.3s -- x6.9
293-
294-
// SCALING test done in pc8201 (4 cores, Core2 Quad Q9550 @ 2.83GHz, 8GB RAM)
295-
//1 threads, time: 17.2s -- x1.0
296-
//2 threads, time: 9.7s -- x1.8
297-
//3 threads, time: 7.7s -- x2.2
298-
//4 threads, time: 7.9s -- x2.2
299-
300-
// SCALING test done in eppic01 (16 cores, Xeon E5-2650 0 @ 2.00GHz, 128GB RAM)
301-
//1 threads, time: 10.7s -- x1.0
302-
//2 threads, time: 5.6s -- x1.9
303-
//3 threads, time: 3.6s -- x3.0
304-
//4 threads, time: 2.8s -- x3.9
305-
//5 threads, time: 2.3s -- x4.8
306-
//6 threads, time: 1.8s -- x6.0
307-
//7 threads, time: 1.6s -- x6.8
308-
//8 threads, time: 1.3s -- x8.0
309-
//9 threads, time: 1.3s -- x8.5
310-
//10 threads, time: 1.1s -- x10.0
311-
//11 threads, time: 1.0s -- x10.9
312-
//12 threads, time: 0.9s -- x11.4
313-
314-
315288

316289
ExecutorService threadPool = Executors.newFixedThreadPool(nThreads);
317290

318-
319291
for (int i=0;i<atomCoords.length;i++) {
320292
threadPool.submit(new AsaCalcWorker(i,asas));
321293
}
@@ -342,20 +314,20 @@ void setUseSpatialHashingForNeighbors(boolean useSpatialHashingForNeighbors) {
342314
}
343315

344316
/**
345-
* Returns list of 3d coordinates of points on a sphere using the
317+
* Returns list of 3d coordinates of points on a unit sphere using the
346318
* Golden Section Spiral algorithm.
347319
* @param nSpherePoints the number of points to be used in generating the spherical dot-density
348-
* @return
320+
* @return the array of points as Vector3d objects
349321
*/
350-
private Point3d[] generateSpherePoints(int nSpherePoints) {
351-
Point3d[] points = new Point3d[nSpherePoints];
322+
private Vector3d[] generateSpherePoints(int nSpherePoints) {
323+
Vector3d[] points = new Vector3d[nSpherePoints];
352324
double inc = Math.PI * (3.0 - Math.sqrt(5.0));
353325
double offset = 2.0 / nSpherePoints;
354326
for (int k=0;k<nSpherePoints;k++) {
355327
double y = k * offset - 1.0 + (offset / 2.0);
356328
double r = Math.sqrt(1.0 - y*y);
357329
double phi = k * inc;
358-
points[k] = new Point3d(Math.cos(phi)*r, y, Math.sin(phi)*r);
330+
points[k] = new Vector3d(Math.cos(phi)*r, y, Math.sin(phi)*r);
359331
}
360332
return points;
361333
}
@@ -364,30 +336,29 @@ private Point3d[] generateSpherePoints(int nSpherePoints) {
364336
* Returns the 2-dimensional array with neighbor indices for every atom.
365337
* @return 2-dimensional array of size: n_atoms x n_neighbors_per_atom
366338
*/
367-
int[][] findNeighborIndices() {
339+
IndexAndDistance[][] findNeighborIndices() {
368340

369341
// looking at a typical protein case, number of neighbours are from ~10 to ~50, with an average of ~30
370342
int initialCapacity = 60;
371343

372-
int[][] nbsIndices = new int[atomCoords.length][];
344+
IndexAndDistance[][] nbsIndices = new IndexAndDistance[atomCoords.length][];
373345

374346
for (int k=0; k<atomCoords.length; k++) {
375347
double radius = radii[k] + probe + probe;
376348

377-
List<Integer> thisNbIndices = new ArrayList<>(initialCapacity);
349+
List<IndexAndDistance> thisNbIndices = new ArrayList<>(initialCapacity);
378350

379351
for (int i = 0; i < atomCoords.length; i++) {
380352
if (i == k) continue;
381353

382354
double dist = atomCoords[i].distance(atomCoords[k]);
383355

384356
if (dist < radius + radii[i]) {
385-
thisNbIndices.add(i);
357+
thisNbIndices.add(new IndexAndDistance(i, dist));
386358
}
387359
}
388360

389-
int[] indicesArray = new int[thisNbIndices.size()];
390-
for (int i=0;i<thisNbIndices.size();i++) indicesArray[i] = thisNbIndices.get(i);
361+
IndexAndDistance[] indicesArray = thisNbIndices.toArray(new IndexAndDistance[0]);
391362
nbsIndices[k] = indicesArray;
392363
}
393364
return nbsIndices;
@@ -398,20 +369,20 @@ int[][] findNeighborIndices() {
398369
* using spatial hashing to avoid all to all distance calculation.
399370
* @return 2-dimensional array of size: n_atoms x n_neighbors_per_atom
400371
*/
401-
int[][] findNeighborIndicesSpatialHashing() {
372+
IndexAndDistance[][] findNeighborIndicesSpatialHashing() {
402373

403374
// looking at a typical protein case, number of neighbours are from ~10 to ~50, with an average of ~30
404375
int initialCapacity = 60;
405376

406377
List<Contact> contactList = calcContacts();
407-
Map<Integer, List<Integer>> indices = new HashMap<>(atomCoords.length);
378+
Map<Integer, List<IndexAndDistance>> indices = new HashMap<>(atomCoords.length);
408379
for (Contact contact : contactList) {
409380
// note contacts are stored 1-way only, with j>i
410381
int i = contact.getI();
411382
int j = contact.getJ();
412383

413-
List<Integer> iIndices;
414-
List<Integer> jIndices;
384+
List<IndexAndDistance> iIndices;
385+
List<IndexAndDistance> jIndices;
415386
if (!indices.containsKey(i)) {
416387
iIndices = new ArrayList<>(initialCapacity);
417388
indices.put(i, iIndices);
@@ -428,24 +399,23 @@ int[][] findNeighborIndicesSpatialHashing() {
428399
double radius = radii[i] + probe + probe;
429400
double dist = contact.getDistance();
430401
if (dist < radius + radii[j]) {
431-
iIndices.add(j);
432-
jIndices.add(i);
402+
iIndices.add(new IndexAndDistance(j, dist));
403+
jIndices.add(new IndexAndDistance(i, dist));
433404
}
434405
}
435406

436407
// convert map to array for fast access
437-
int[][] nbsIndices = new int[atomCoords.length][];
438-
for (Map.Entry<Integer, List<Integer>> entry : indices.entrySet()) {
439-
List<Integer> list = entry.getValue();
440-
int[] indicesArray = new int[list.size()];
441-
for (int i=0;i<entry.getValue().size();i++) indicesArray[i] = list.get(i);
442-
nbsIndices[entry.getKey()] = indicesArray;
408+
IndexAndDistance[][] nbsIndices = new IndexAndDistance[atomCoords.length][];
409+
for (Map.Entry<Integer, List<IndexAndDistance>> entry : indices.entrySet()) {
410+
List<IndexAndDistance> list = entry.getValue();
411+
IndexAndDistance[] indexAndDistances = list.toArray(new IndexAndDistance[0]);
412+
nbsIndices[entry.getKey()] = indexAndDistances;
443413
}
444414

445415
// important: some atoms might have no neighbors at all: we need to initialise to empty arrays
446416
for (int i=0; i<nbsIndices.length; i++) {
447417
if (nbsIndices[i] == null) {
448-
nbsIndices[i] = new int[0];
418+
nbsIndices[i] = new IndexAndDistance[0];
449419
}
450420
}
451421

@@ -474,35 +444,51 @@ private double calcSingleAsa(int i) {
474444
Point3d atom_i = atomCoords[i];
475445

476446
int n_neighbor = neighborIndices[i].length;
477-
int[] neighbor_indices = neighborIndices[i];
478-
int j_closest_neighbor = 0;
479-
double radius = probe + radii[i];
447+
IndexAndDistance[] neighbor_indices = neighborIndices[i];
448+
// Sorting by closest to farthest away neighbors achieves faster runtimes when checking for occluded
449+
// sphere sample points below. This follows the ideas exposed in
450+
// Eisenhaber et al, J Comp Chemistry 1994 (https://onlinelibrary.wiley.com/doi/epdf/10.1002/jcc.540160303)
451+
// This is essential for performance. In my tests this brings down the number of occlusion checks in loop below to
452+
// an average of n_sphere_points/10 per atom i, producing ~ x4 performance gain overall
453+
Arrays.sort(neighbor_indices, Comparator.comparingDouble(o -> o.dist));
454+
455+
double radius_i = probe + radii[i];
480456

481457
int n_accessible_point = 0;
458+
// purely for debugging
459+
int[] numDistsCalced = null;
460+
if (logger.isDebugEnabled()) numDistsCalced = new int[n_neighbor];
461+
462+
// now we precalculate anything depending only on i,j in equation 3 in Eisenhaber 1994
463+
double[] sqRadii = new double[n_neighbor];
464+
Vector3d[] aj_minus_ais = new Vector3d[n_neighbor];
465+
for (int nbArrayInd =0; nbArrayInd<n_neighbor; nbArrayInd++) {
466+
int j = neighbor_indices[nbArrayInd].index;
467+
double dist = neighbor_indices[nbArrayInd].dist;
468+
double radius_j = radii[j] + probe;
469+
// see equation 3 in Eisenhaber 1994
470+
sqRadii[nbArrayInd] = (dist*dist + radius_i*radius_i - radius_j*radius_j)/(2*radius_i);
471+
Vector3d aj_minus_ai = new Vector3d(atomCoords[j]);
472+
aj_minus_ai.sub(atom_i);
473+
aj_minus_ais[nbArrayInd] = aj_minus_ai;
474+
}
482475

483-
for (Point3d point: spherePoints){
476+
for (Vector3d point: spherePoints){
484477
boolean is_accessible = true;
485-
Point3d test_point = new Point3d(point.x*radius + atom_i.x,
486-
point.y*radius + atom_i.y,
487-
point.z*radius + atom_i.z);
488-
489-
int[] cycled_indices = new int[n_neighbor];
490-
int arind = 0;
491-
for (int ind=j_closest_neighbor;ind<n_neighbor;ind++) {
492-
cycled_indices[arind] = ind;
493-
arind++;
494-
}
495-
for (int ind=0;ind<j_closest_neighbor;ind++){
496-
cycled_indices[arind] = ind;
497-
arind++;
498-
}
499478

500-
for (int j: cycled_indices) {
501-
Point3d atom_j = atomCoords[neighbor_indices[j]];
502-
double r = radii[neighbor_indices[j]] + probe;
503-
double diff_sq = test_point.distanceSquared(atom_j);
504-
if (diff_sq < r*r) {
505-
j_closest_neighbor = j;
479+
// note that the neighbors are sorted by distance, achieving optimal performance in this inner loop
480+
// See Eisenhaber et al, J Comp Chemistry 1994
481+
482+
for (int nbArrayInd =0; nbArrayInd<n_neighbor; nbArrayInd++) {
483+
484+
// see equation 3 in Eisenhaber 1994. This is slightly more efficient than
485+
// calculating distances to the actual sphere points on atom_i (which would be obtained with:
486+
// Point3d test_point = new Point3d(point.x*radius + atom_i.x,point.y*radius + atom_i.y,point.z*radius + atom_i.z))
487+
double dotProd = aj_minus_ais[nbArrayInd].dot(point);
488+
489+
if (numDistsCalced!=null) numDistsCalced[nbArrayInd]++;
490+
491+
if (dotProd > sqRadii[nbArrayInd]) {
506492
is_accessible = false;
507493
break;
508494
}
@@ -511,7 +497,15 @@ private double calcSingleAsa(int i) {
511497
n_accessible_point++;
512498
}
513499
}
514-
return cons*n_accessible_point*radius*radius;
500+
501+
// purely for debugging
502+
if (numDistsCalced!=null) {
503+
int sum = 0;
504+
for (int numDistCalcedForJ : numDistsCalced) sum += numDistCalcedForJ;
505+
logger.debug("Number of sample points distances calculated for neighbors of i={} : average {}, all {}", i, (double) sum / (double) n_neighbor, numDistsCalced);
506+
}
507+
508+
return cons*n_accessible_point*radius_i*radius_i;
515509
}
516510

517511
/**
@@ -577,16 +571,15 @@ else if (atomCode.equals("CA") || atomCode.equals("CB") ||
577571
else if (atomCode.equals("CG")) return TETRAHEDRAL_CARBON_VDW;
578572

579573
default:
580-
logger.info("Unexpected carbon atom "+atomCode+" for aminoacid "+aa+", assigning its standard vdw radius");
574+
logger.info("Unexpected carbon atom {} for aminoacid {}, assigning its standard vdw radius", atomCode, aa);
581575
return Element.C.getVDWRadius();
582576
}
583577
}
584578

585579
// not any of the expected atoms
586580
} else {
587581
// non standard aas, (e.g. MSE, LLP) will always have this problem,
588-
logger.info("Unexpected atom "+atomCode+" for aminoacid "+aa+ " ("+amino.getPDBName()+"), assigning its standard vdw radius");
589-
582+
logger.debug("Unexpected atom {} for aminoacid {} ({}), assigning its standard vdw radius", atomCode, aa, amino.getPDBName());
590583

591584
return atom.getElement().getVDWRadius();
592585
}

0 commit comments

Comments
 (0)