Skip to content

Commit de9cc0e

Browse files
authored
Merge pull request #820 from josemduarte/asaCalcPerformance
Improved ASA calculation performance
2 parents f9bfd93 + 8fb10fd commit de9cc0e

File tree

4 files changed

+289
-55
lines changed

4 files changed

+289
-55
lines changed

biojava-structure/src/main/java/org/biojava/nbio/structure/asa/AsaCalculator.java

Lines changed: 163 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,13 @@
2121
package org.biojava.nbio.structure.asa;
2222

2323
import org.biojava.nbio.structure.*;
24+
import org.biojava.nbio.structure.contact.Contact;
25+
import org.biojava.nbio.structure.contact.Grid;
2426
import org.slf4j.Logger;
2527
import org.slf4j.LoggerFactory;
2628

2729
import javax.vecmath.Point3d;
28-
import java.util.ArrayList;
29-
import java.util.TreeMap;
30+
import java.util.*;
3031
import java.util.concurrent.ExecutorService;
3132
import java.util.concurrent.Executors;
3233

@@ -36,28 +37,34 @@
3637
/**
3738
* Class to calculate Accessible Surface Areas based on
3839
* the rolling ball algorithm by Shrake and Rupley.
39-
*
40+
* <p>
4041
* The code is adapted from a python implementation at http://boscoh.com/protein/asapy
4142
* (now source is available at https://github.com/boscoh/asa).
4243
* Thanks to Bosco K. Ho for a great piece of code and for his fantastic blog.
43-
*
44+
* <p>
4445
* See
4546
* Shrake, A., and J. A. Rupley. "Environment and Exposure to Solvent of Protein Atoms.
4647
* Lysozyme and Insulin." JMB (1973) 79:351-371.
4748
* Lee, B., and Richards, F.M. "The interpretation of Protein Structures: Estimation of
4849
* Static Accessibility" JMB (1971) 55:379-400
49-
* @author duarte_j
50+
*
51+
* @author Jose Duarte
5052
*
5153
*/
5254
public class AsaCalculator {
5355

5456
private static final Logger logger = LoggerFactory.getLogger(AsaCalculator.class);
5557

56-
// Bosco uses as default 960, Shrake and Rupley seem to use in their paper 92 (not sure if this is actually the same parameter)
57-
public static final int DEFAULT_N_SPHERE_POINTS = 960;
58+
/**
59+
* The default value for number of sphere points to sample.
60+
* See this paper for a nice study on the effect of this parameter: https://f1000research.com/articles/5-189/v1
61+
*/
62+
public static final int DEFAULT_N_SPHERE_POINTS = 1000;
5863
public static final double DEFAULT_PROBE_SIZE = 1.4;
5964
public static final int DEFAULT_NTHREADS = 1;
6065

66+
private static final boolean DEFAULT_USE_SPATIAL_HASHING = true;
67+
6168

6269

6370
// Chothia's amino acid atoms vdw radii
@@ -101,15 +108,19 @@ public void run() {
101108
private int nThreads;
102109
private Point3d[] spherePoints;
103110
private double cons;
111+
private int[][] neighborIndices;
112+
113+
private boolean useSpatialHashingForNeighbors;
104114

105115
/**
106116
* Constructs a new AsaCalculator. Subsequently call {@link #calculateAsas()}
107117
* or {@link #getGroupAsas()} to calculate the ASAs
108118
* Only non-Hydrogen atoms are considered in the calculation.
109-
* @param structure
110-
* @param probe
111-
* @param nSpherePoints
112-
* @param nThreads
119+
* @param structure the structure, all non-H atoms will be used
120+
* @param probe the probe size
121+
* @param nSpherePoints the number of points to be used in generating the spherical
122+
* dot-density, the more points the more accurate (and slower) calculation
123+
* @param nThreads the number of parallel threads to use for the calculation
113124
* @param hetAtoms if true HET residues are considered, if false they aren't, equivalent to
114125
* NACCESS' -h option
115126
* @see StructureTools#getAllNonHAtomArray
@@ -120,16 +131,15 @@ public AsaCalculator(Structure structure, double probe, int nSpherePoints, int n
120131
this.probe = probe;
121132
this.nThreads = nThreads;
122133

134+
this.useSpatialHashingForNeighbors = DEFAULT_USE_SPATIAL_HASHING;
135+
123136
// initialising the radii by looking them up through AtomRadii
124137
radii = new double[atomCoords.length];
125138
for (int i=0;i<atomCoords.length;i++) {
126139
radii[i] = getRadius(atoms[i]);
127140
}
128141

129-
// initialising the sphere points to sample
130-
spherePoints = generateSpherePoints(nSpherePoints);
131-
132-
cons = 4.0 * Math.PI / nSpherePoints;
142+
initSpherePoints(nSpherePoints);
133143
}
134144

135145
/**
@@ -148,6 +158,8 @@ public AsaCalculator(Atom[] atoms, double probe, int nSpherePoints, int nThreads
148158
this.probe = probe;
149159
this.nThreads = nThreads;
150160

161+
this.useSpatialHashingForNeighbors = DEFAULT_USE_SPATIAL_HASHING;
162+
151163
for (Atom atom:atoms) {
152164
if (atom.getElement()==Element.H)
153165
throw new IllegalArgumentException("Can't calculate ASA for an array that contains Hydrogen atoms ");
@@ -159,10 +171,7 @@ public AsaCalculator(Atom[] atoms, double probe, int nSpherePoints, int nThreads
159171
radii[i] = getRadius(atoms[i]);
160172
}
161173

162-
// initialising the sphere points to sample
163-
spherePoints = generateSpherePoints(nSpherePoints);
164-
165-
cons = 4.0 * Math.PI / nSpherePoints;
174+
initSpherePoints(nSpherePoints);
166175
}
167176

168177
/**
@@ -188,12 +197,21 @@ public AsaCalculator(Point3d[] atomCoords, double probe, int nSpherePoints, int
188197
this.probe = probe;
189198
this.nThreads = nThreads;
190199

200+
this.useSpatialHashingForNeighbors = DEFAULT_USE_SPATIAL_HASHING;
201+
191202
// initialising the radii to the given radius for all atoms
192203
radii = new double[atomCoords.length];
193204
for (int i=0;i<atomCoords.length;i++) {
194205
radii[i] = radius;
195206
}
196207

208+
initSpherePoints(nSpherePoints);
209+
}
210+
211+
private void initSpherePoints(int nSpherePoints) {
212+
213+
logger.debug("Will use {} sphere points", nSpherePoints);
214+
197215
// initialising the sphere points to sample
198216
spherePoints = generateSpherePoints(nSpherePoints);
199217

@@ -209,7 +227,7 @@ public AsaCalculator(Point3d[] atomCoords, double probe, int nSpherePoints, int
209227
*/
210228
public GroupAsa[] getGroupAsas() {
211229

212-
TreeMap<ResidueNumber, GroupAsa> asas = new TreeMap<ResidueNumber, GroupAsa>();
230+
TreeMap<ResidueNumber, GroupAsa> asas = new TreeMap<>();
213231

214232
double[] asasPerAtom = calculateAsas();
215233

@@ -238,12 +256,26 @@ public double[] calculateAsas() {
238256

239257
double[] asas = new double[atomCoords.length];
240258

259+
long start = System.currentTimeMillis();
260+
if (useSpatialHashingForNeighbors) {
261+
logger.debug("Will use spatial hashing to find neighbors");
262+
neighborIndices = findNeighborIndicesSpatialHashing();
263+
} else {
264+
logger.debug("Will not use spatial hashing to find neighbors");
265+
neighborIndices = findNeighborIndices();
266+
}
267+
long end = System.currentTimeMillis();
268+
logger.debug("Took {} s to find neighbors", (end-start)/1000.0);
269+
270+
start = System.currentTimeMillis();
241271
if (nThreads<=1) { // (i.e. it will also be 1 thread if 0 or negative number specified)
272+
logger.debug("Will use 1 thread for ASA calculation");
242273
for (int i=0;i<atomCoords.length;i++) {
243274
asas[i] = calcSingleAsa(i);
244275
}
245276

246277
} else {
278+
logger.debug("Will use {} threads for ASA calculation", nThreads);
247279
// NOTE the multithreaded calculation does not scale up well in some systems,
248280
// why? I guess some memory/garbage collect problem? I tried increasing Xmx in pc8201 but didn't help
249281

@@ -293,10 +325,22 @@ public double[] calculateAsas() {
293325
while (!threadPool.isTerminated());
294326

295327
}
328+
end = System.currentTimeMillis();
329+
logger.debug("Took {} s to calculate all {} atoms ASAs (excluding neighbors calculation)", (end-start)/1000.0, atomCoords.length);
296330

297331
return asas;
298332
}
299333

334+
/**
335+
* Set the useSpatialHashingForNeighbors flag to use spatial hashing to calculate neighbors (true) or all-to-all
336+
* distance calculation (false). Default is {@value DEFAULT_USE_SPATIAL_HASHING}.
337+
* Use for testing performance only.
338+
* @param useSpatialHashingForNeighbors the flag
339+
*/
340+
void setUseSpatialHashingForNeighbors(boolean useSpatialHashingForNeighbors) {
341+
this.useSpatialHashingForNeighbors = useSpatialHashingForNeighbors;
342+
}
343+
300344
/**
301345
* Returns list of 3d coordinates of points on a sphere using the
302346
* Golden Section Spiral algorithm.
@@ -317,36 +361,118 @@ private Point3d[] generateSpherePoints(int nSpherePoints) {
317361
}
318362

319363
/**
320-
* Returns list of indices of atoms within probe distance to atom k.
321-
* @param k index of atom for which we want neighbor indices
364+
* Returns the 2-dimensional array with neighbor indices for every atom.
365+
* @return 2-dimensional array of size: n_atoms x n_neighbors_per_atom
322366
*/
323-
private Integer[] findNeighborIndices(int k) {
367+
int[][] findNeighborIndices() {
368+
324369
// looking at a typical protein case, number of neighbours are from ~10 to ~50, with an average of ~30
325-
// Thus 40 seems to be a good compromise for the starting capacity
326-
ArrayList<Integer> neighbor_indices = new ArrayList<>(40);
370+
int initialCapacity = 60;
327371

328-
double radius = radii[k] + probe + probe;
372+
int[][] nbsIndices = new int[atomCoords.length][];
329373

330-
for (int i=0;i<atomCoords.length;i++) {
331-
if (i==k) continue;
374+
for (int k=0; k<atomCoords.length; k++) {
375+
double radius = radii[k] + probe + probe;
376+
377+
List<Integer> thisNbIndices = new ArrayList<>(initialCapacity);
378+
379+
for (int i = 0; i < atomCoords.length; i++) {
380+
if (i == k) continue;
381+
382+
double dist = atomCoords[i].distance(atomCoords[k]);
383+
384+
if (dist < radius + radii[i]) {
385+
thisNbIndices.add(i);
386+
}
387+
}
388+
389+
int[] indicesArray = new int[thisNbIndices.size()];
390+
for (int i=0;i<thisNbIndices.size();i++) indicesArray[i] = thisNbIndices.get(i);
391+
nbsIndices[k] = indicesArray;
392+
}
393+
return nbsIndices;
394+
}
395+
396+
/**
397+
* Returns the 2-dimensional array with neighbor indices for every atom,
398+
* using spatial hashing to avoid all to all distance calculation.
399+
* @return 2-dimensional array of size: n_atoms x n_neighbors_per_atom
400+
*/
401+
int[][] findNeighborIndicesSpatialHashing() {
332402

333-
double dist = atomCoords[i].distance(atomCoords[k]);
403+
// looking at a typical protein case, number of neighbours are from ~10 to ~50, with an average of ~30
404+
int initialCapacity = 60;
405+
406+
List<Contact> contactList = calcContacts();
407+
Map<Integer, List<Integer>> indices = new HashMap<>(atomCoords.length);
408+
for (Contact contact : contactList) {
409+
// note contacts are stored 1-way only, with j>i
410+
int i = contact.getI();
411+
int j = contact.getJ();
412+
413+
List<Integer> iIndices;
414+
List<Integer> jIndices;
415+
if (!indices.containsKey(i)) {
416+
iIndices = new ArrayList<>(initialCapacity);
417+
indices.put(i, iIndices);
418+
} else {
419+
iIndices = indices.get(i);
420+
}
421+
if (!indices.containsKey(j)) {
422+
jIndices = new ArrayList<>(initialCapacity);
423+
indices.put(j, jIndices);
424+
} else {
425+
jIndices = indices.get(j);
426+
}
334427

335-
if (dist < radius + radii[i]) {
336-
neighbor_indices.add(i);
428+
double radius = radii[i] + probe + probe;
429+
double dist = contact.getDistance();
430+
if (dist < radius + radii[j]) {
431+
iIndices.add(j);
432+
jIndices.add(i);
337433
}
434+
}
338435

436+
// convert map to array for fast access
437+
int[][] nbsIndices = new int[atomCoords.length][];
438+
for (Map.Entry<Integer, List<Integer>> entry : indices.entrySet()) {
439+
List<Integer> list = entry.getValue();
440+
int[] indicesArray = new int[list.size()];
441+
for (int i=0;i<entry.getValue().size();i++) indicesArray[i] = list.get(i);
442+
nbsIndices[entry.getKey()] = indicesArray;
339443
}
340444

341-
Integer[] indicesArray = new Integer[neighbor_indices.size()];
342-
indicesArray = neighbor_indices.toArray(indicesArray);
343-
return indicesArray;
445+
return nbsIndices;
446+
}
447+
448+
Point3d[] getAtomCoords() {
449+
return atomCoords;
450+
}
451+
452+
private List<Contact> calcContacts() {
453+
double maxRadius = maxValue(radii);
454+
double cutoff = maxRadius + maxRadius + probe + probe;
455+
logger.debug("Max radius is {}, cutoff is {}", maxRadius, cutoff);
456+
Grid grid = new Grid(cutoff);
457+
grid.addCoords(atomCoords);
458+
return grid.getIndicesContacts();
459+
}
460+
461+
private static double maxValue(double[] array) {
462+
double max = array[0];
463+
for (int i = 0; i < array.length; i++) {
464+
if (array[i] > max) {
465+
max = array[i];
466+
}
467+
}
468+
return max;
344469
}
345470

346471
private double calcSingleAsa(int i) {
347472
Point3d atom_i = atomCoords[i];
348-
Integer[] neighbor_indices = findNeighborIndices(i);
349-
int n_neighbor = neighbor_indices.length;
473+
474+
int n_neighbor = neighborIndices[i].length;
475+
int[] neighbor_indices = neighborIndices[i];
350476
int j_closest_neighbor = 0;
351477
double radius = probe + radii[i];
352478

@@ -497,7 +623,7 @@ private static double getRadiusForNucl(NucleotideImpl nuc, Atom atom) {
497623
*
498624
* If atom is neither part of a nucleotide nor of a standard aminoacid,
499625
* the default vdw radius for the element is returned. If atom is of
500-
* unknown type (element) the vdw radius of {@link #Element().N} is returned
626+
* unknown type (element) the vdw radius of {@link Element().N} is returned
501627
*
502628
* @param atom
503629
* @return

biojava-structure/src/main/java/org/biojava/nbio/structure/contact/StructureInterface.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -209,8 +209,8 @@ protected void setAsas(double[] asas1, double[] asas2, int nSpherePoints, int nT
209209
throw new IllegalArgumentException("The size of ASAs of complex doesn't match that of ASAs 1 + ASAs 2");
210210

211211

212-
groupAsas1 = new TreeMap<ResidueNumber, GroupAsa>();
213-
groupAsas2 = new TreeMap<ResidueNumber, GroupAsa>();
212+
groupAsas1 = new TreeMap<>();
213+
groupAsas2 = new TreeMap<>();
214214

215215
this.totalArea = 0;
216216

0 commit comments

Comments
 (0)