Skip to content

Commit bbdd683

Browse files
committed
Improvements in file downloading, trying to address biojava#227
1 parent 475c6f9 commit bbdd683

7 files changed

Lines changed: 149 additions & 95 deletions

File tree

biojava3-structure/src/main/java/org/biojava/bio/structure/io/LocalPDBDirectory.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -546,7 +546,7 @@ private File downloadStructure(String pdbId, String pathOnServer, boolean obsole
546546

547547
URL url = new URL(ftp);
548548

549-
FileDownloadUtils.downloadGzipCompressedFile(url, realFile);
549+
FileDownloadUtils.downloadFile(url, realFile);
550550

551551
return realFile;
552552
}

biojava3-structure/src/main/java/org/biojava/bio/structure/io/sifts/SiftsMappingProvider.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -102,7 +102,7 @@ public static List<SiftsEntity> getSiftsMapping(String pdbId) throws IOException
102102
String u = String.format(fileLoc,pdbId);
103103
URL url = new URL(u);
104104
logger.info("Downloading SIFTS file {} to {}",url,dest);
105-
FileDownloadUtils.downloadGzipCompressedFile(url, dest);
105+
FileDownloadUtils.downloadFile(url, dest);
106106
}
107107

108108
InputStreamProvider prov = new InputStreamProvider();

biojava3-structure/src/main/java/org/biojava/bio/structure/io/util/FileDownloadUtils.java

Lines changed: 53 additions & 86 deletions
Original file line number | Diff line number | Diff line change
@@ -24,19 +24,15 @@
2424
*/
2525
package org.biojava.bio.structure.io.util;
2626

27-
import java.io.BufferedReader;
2827
import java.io.File;
2928
import java.io.FileInputStream;
3029
import java.io.FileOutputStream;
3130
import java.io.IOException;
32-
import java.io.InputStream;
33-
import java.io.InputStreamReader;
34-
import java.io.OutputStream;
35-
import java.io.PrintWriter;
3631
import java.net.HttpURLConnection;
3732
import java.net.URL;
38-
import java.util.zip.GZIPInputStream;
39-
import java.util.zip.GZIPOutputStream;
33+
import java.nio.channels.Channels;
34+
import java.nio.channels.FileChannel;
35+
import java.nio.channels.ReadableByteChannel;
4036

4137
import org.slf4j.Logger;
4238
import org.slf4j.LoggerFactory;
@@ -45,36 +41,41 @@ public class FileDownloadUtils {
4541

4642
private static final Logger logger = LoggerFactory.getLogger(FileDownloadUtils.class);
4743

48-
/** Copy the content of file A to B
44+
/**
45+
* Copy the content of file src to dst
4946
* TODO since java 1.7 this is provided in java.nio.file.Files
5047
* @param src
5148
* @param dst
5249
* @throws IOException
5350
*/
54-
public static void copy(File src, File dst) throws IOException {
55-
56-
InputStream in = new FileInputStream(src);
57-
OutputStream out = new FileOutputStream(dst);
51+
public static void copy(File src, File dst) throws IOException {
52+
53+
// Took following recipe from
54+
// http://stackoverflow.com/questions/106770/standard-concise-way-to-copy-a-file-in-java
55+
// The nio package seems to be the most efficient way to copy a file
56+
FileChannel source = null;
57+
FileChannel destination = null;
5858

59-
// Transfer bytes from in to out
60-
byte[] buf = new byte[1024];
61-
int len;
62-
while ((len = in.read(buf)) > 0) {
63-
out.write(buf, 0, len);
64-
}
65-
in.close();
66-
out.close();
59+
try {
60+
source = new FileInputStream(src).getChannel();
61+
destination = new FileOutputStream(dst).getChannel();
62+
destination.transferFrom(source, 0, source.size());
63+
}
64+
finally {
65+
if(source != null) {
66+
source.close();
67+
}
68+
if(destination != null) {
69+
destination.close();
70+
}
71+
}
6772
}
6873

6974
public static String getFileExtension(File f){
7075
String fileName = f.getName();
71-
//String fname="";
72-
String ext="";
76+
String ext = "";
7377
int mid= fileName.lastIndexOf(".");
74-
//fname=fileName.substring(0,mid);
7578
ext=fileName.substring(mid+1,fileName.length());
76-
//System.out.println("File name ="+fname);
77-
//System.out.println("Extension ="+ext);
7879
return ext;
7980
}
8081

@@ -89,83 +90,49 @@ public static String getFilePrefix(File f){
8990
}
9091

9192

92-
/** Download the content provided at URL url and stores the result to a local file
93+
/**
94+
* Download the content provided at URL url and store the result to a local file,
95+
* using a temp file to cache the content in case something goes wrong in download
9396
*
9497
* @param url
9598
* @param destination
9699
* @throws IOException
97100
*/
98-
public static void downloadGzipCompressedFile(URL url, File destination) throws IOException{
99-
100-
101-
InputStream uStream = url.openStream();
102-
InputStream conn = new GZIPInputStream(uStream);
101+
public static void downloadFile(URL url, File destination) throws IOException {
103102

104103
File tempFile = File.createTempFile(getFilePrefix(destination), "."+ getFileExtension(destination));
105104

106-
// System.out.println("downloading " + url + " to " + tempFile.getAbsolutePath());
107-
FileOutputStream outPut = new FileOutputStream(tempFile);
108-
GZIPOutputStream gzOutPut = new GZIPOutputStream(outPut);
109-
PrintWriter pw = new PrintWriter(gzOutPut);
110-
111-
BufferedReader fileBuffer = new BufferedReader(new InputStreamReader(conn));
112-
String line;
113-
while ((line = fileBuffer.readLine()) != null) {
114-
pw.println(line);
105+
// Took following recipe from stackoverflow:
106+
// http://stackoverflow.com/questions/921262/how-to-download-and-save-a-file-from-internet-using-java
107+
// It seems to be the most efficient way to transfer a file
108+
// See: http://docs.oracle.com/javase/7/docs/api/java/nio/channels/FileChannel.html
109+
110+
ReadableByteChannel rbc = null;
111+
FileOutputStream fos = null;
112+
try {
113+
rbc = Channels.newChannel(url.openStream());
114+
fos = new FileOutputStream(tempFile);
115+
fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE);
116+
fos.close();
117+
rbc.close();
115118
}
116-
pw.flush();
117-
pw.close();
118-
119-
outPut.flush();
120-
outPut.close();
121-
conn.close();
122-
uStream.close();
123-
124-
// copy file name to **real** location (without the tmpFileName)
125-
// prepare destination
126-
// System.out.println("copying to " + destination);
127-
119+
finally {
120+
if(rbc != null) {
121+
rbc.close();
122+
}
123+
if(fos != null) {
124+
fos.close();
125+
}
126+
}
127+
128+
logger.debug("Copying temp file {} to final location {}",tempFile, destination);
128129
copy(tempFile, destination);
129130

130131
// delete the tmp file
131132
tempFile.delete();
132133

133134
}
134135

135-
public static File downloadFileIfAvailable(URL url, File destination) throws IOException {
136-
137-
InputStream uStream = url.openStream();
138-
InputStream conn = new GZIPInputStream(uStream);
139-
140-
FileOutputStream outPut = null;
141-
GZIPOutputStream gzOutPut = null;
142-
File tempFile = File.createTempFile(getFilePrefix(destination), "."+ getFileExtension(destination));
143-
144-
outPut = new FileOutputStream(tempFile);
145-
gzOutPut = new GZIPOutputStream(outPut);
146-
PrintWriter pw = new PrintWriter(gzOutPut);
147-
148-
BufferedReader fileBuffer = new BufferedReader(new InputStreamReader(conn));
149-
String line;
150-
while ((line = fileBuffer.readLine()) != null) {
151-
pw.println(line);
152-
}
153-
pw.flush();
154-
pw.close();
155-
156-
outPut.flush();
157-
outPut.close();
158-
conn.close();
159-
uStream.close();
160-
161-
162-
logger.info("Writing to " + destination);
163-
164-
copy(tempFile, destination);
165-
166-
return destination;
167-
}
168-
169136
/**
170137
* Converts path to Unix convention and adds a terminating slash if it was omitted
171138
* @param path original platform dependent path

integrationtest/src/test/java/org/biojava/structure/test/align/fatcat/MyTestHelper.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,14 +34,15 @@
3434
import org.biojava.bio.structure.align.xml.AFPChainXMLConverter;
3535
import org.biojava.bio.structure.align.xml.AFPChainXMLParser;
3636
import org.biojava.bio.structure.io.FileParsingParameters;
37+
import org.biojava.bio.structure.io.LocalPDBDirectory.FetchBehavior;
3738
import org.biojava.bio.structure.io.PDBFileReader;
3839

3940

4041

4142
public class MyTestHelper
4243
{
4344

44-
//
45+
4546
public static final String pdbPath = System.getProperty("java.io.tmpdir");
4647

4748
public static String compareAlignment(String pdb1, String chain1, String pdb2, String chain2, String originalOutput, boolean doRigid){
@@ -50,7 +51,7 @@ public static String compareAlignment(String pdb1, String chain1, String pdb2, S
5051

5152
PDBFileReader pdbpars = new PDBFileReader();
5253
pdbpars.setPath(pdbPath);
53-
pdbpars.setAutoFetch(true);
54+
pdbpars.setFetchBehavior(FetchBehavior.FETCH_FILES);
5455

5556
FileParsingParameters params = new FileParsingParameters();
5657
params.setAlignSeqRes(true);
integrationtest/src/test/java/org/biojava/structure/test/io/TestAtomCachePerformance.java

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
package org.biojava.structure.test.io;
2+
3+
import static org.junit.Assert.*;
4+
5+
import java.io.IOException;
6+
7+
import org.biojava.bio.structure.Structure;
8+
import org.biojava.bio.structure.StructureException;
9+
import org.biojava.bio.structure.align.util.AtomCache;
10+
import org.biojava.bio.structure.io.LocalPDBDirectory.FetchBehavior;
11+
import org.junit.BeforeClass;
12+
import org.junit.Test;
13+
14+
/**
15+
* A test to check the performance of AtomCache downloading
16+
*
17+
* By default it is excluded from executing by main biojava pom.
18+
* To execute use:
19+
* <pre>
20+
* mvn -Dtest=TestAtomCachePerformance test
21+
* </pre>
22+
*
23+
* @author duarte_j
24+
*
25+
*/
26+
public class TestAtomCachePerformance {
27+
28+
private static final String[] PDB_IDS = {
29+
"1zjo", "2dqc", "4af2", "1r52", "4f3u", "1f9v", "3kuq", "2yr4", "3m4f", "4j5p",
30+
"7ccp", "4kro", "1x7q", "2gaw", "2kli", "2bdo", "3csf", "1muu", "190l", "2ecm"
31+
};//,
32+
//"2f0y", "3ind", "3uu6", "1p9j", "1vm7", "2y2c", "2hez", "1yrm", "1yzx", "1ps9",
33+
//"3ue0", "2o0o", "2g59", "4ees", "2yfc", "2anr", "3cxk", "2e7t", "3kmh", "3h00",
34+
//"3gdm", "1c0t", "1fi0", "2kqt", "1ky8", "169l", "1z6h", "1wbm", "4g1j", "1v3c",
35+
//"2chm", "4f0n", "2vxb", "2w0q", "1g1n", "3o6g", "4eug", "3nrm", "3heo", "4ewe",
36+
//"2xjb", "1vgj", "3tpp", "2gnl", "3jpz", "2pgt", "1fn2", "2h13", "1xyj", "1ds7",
37+
//"2x93", "4j5y", "2bk2", "1v83", "4lj9", "4ahc", "1m34", "1jo4", "3flb", "2cb2",
38+
//"4k3p", "1yq8", "2h7z", "2lbp", "3vas", "4jwn", "2e47", "3r43", "3edd", "3kss",
39+
//"2dnk", "1kg2", "2pwh", "1sjh", "4cc0", "3a7c", "1o5a", "4fu7", "3hc4", "3hoz"
40+
//};
41+
42+
private static AtomCache cache;
43+
44+
@BeforeClass
45+
public static void setUpBeforeClass() {
46+
cache = new AtomCache();
47+
cache.setFetchBehavior(FetchBehavior.FORCE_DOWNLOAD);
48+
}
49+
50+
@Test
51+
public void testDownload() throws IOException, StructureException {
52+
System.out.println("Starting performance test for "+PDB_IDS.length+" PDB ids");
53+
long start = System.currentTimeMillis();
54+
for (String pdbId:PDB_IDS) {
55+
Structure cifS = getCifStructure(pdbId);
56+
Structure pdbS = getPdbStructure(pdbId);
57+
assertNotNull(cifS);
58+
assertNotNull(pdbS);
59+
assertEquals(pdbId,cifS.getPDBCode().toLowerCase());
60+
assertEquals(cifS.getPDBCode(),pdbS.getPDBCode());
61+
62+
//System.out.print(".");
63+
64+
}
65+
66+
System.out.println();
67+
68+
long end = System.currentTimeMillis();
69+
70+
System.out.printf("Done in %5.1f s\n",(end-start)/1000.0);
71+
}
72+
73+
private Structure getCifStructure(String pdbId) throws IOException, StructureException {
74+
cache.setUseMmCif(true);
75+
76+
return cache.getStructure(pdbId);
77+
78+
}
79+
80+
private Structure getPdbStructure(String pdbId) throws IOException, StructureException {
81+
cache.setUseMmCif(false);
82+
83+
return cache.getStructure(pdbId);
84+
85+
}
86+
}

integrationtest/src/test/java/org/biojava/structure/test/io/TestLongPdbVsMmCifParsing.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
import org.biojava.bio.structure.StructureException;
3232
import org.biojava.bio.structure.align.util.AtomCache;
3333
import org.biojava.bio.structure.io.FileParsingParameters;
34+
import org.biojava.bio.structure.io.LocalPDBDirectory.ObsoleteBehavior;
3435
import org.biojava.bio.structure.quaternary.BioAssemblyInfo;
3536
import org.biojava.bio.structure.xtal.CrystalCell;
3637
import org.junit.After;
@@ -47,11 +48,11 @@
4748
* Will take very long to run, thus they are excluded by default in the pom.
4849
* To run them use, for the 1000 entries one:
4950
* <pre>
50-
* mvn -DPDB_DIR=/my/pdb/dir -Dtest=TestLongPdbVsMmCifParsing#testLongPdbVsMmCif test
51+
* mvn -Dtest=TestLongPdbVsMmCifParsing#testLongPdbVsMmCif test
5152
* </pre>
5253
* or for the 10000 entries:
5354
* <pre>
54-
* mvn -DPDB_DIR=/my/pdb/dir -Dtest=TestLongPdbVsMmCifParsing#testVeryLongPdbVsMmCif test
55+
* mvn -Dtest=TestLongPdbVsMmCifParsing#testVeryLongPdbVsMmCif test
5556
* </pre>
5657
*
5758
*
@@ -102,8 +103,7 @@ public static void setUpBeforeClass() {
102103

103104
params = new FileParsingParameters();
104105
cache.setFileParsingParams(params);
105-
106-
cache.setFetchCurrent(true);
106+
cache.setObsoleteBehavior(ObsoleteBehavior.THROW_EXCEPTION);
107107
}
108108

109109
@Test

pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -309,7 +309,7 @@
309309
<!-- we exclude by default the long parsing tests, they will only be
310310
executed if explicitly called -->
311311
<exclude>**/TestLongPdbVsMmCifParsing.java</exclude>
312-
312+
<exclude>**/TestAtomCachePerformance.java</exclude>
313313
</excludes>
314314
</configuration>
315315
</plugin>

0 commit comments

Comments
 (0)