Skip to content

Commit 1a256ed

Browse files
authored
Merge pull request biojava#1024 from aalhossary/add_file_download_validation
Add file download validation
2 parents 99aae04 + 07e65b2 commit 1a256ed

7 files changed

Lines changed: 220 additions & 12 deletions

File tree

biojava-core/src/main/java/org/biojava/nbio/core/util/FileDownloadUtils.java

Lines changed: 132 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,12 @@
2323

2424
import java.io.File;
2525
import java.io.FileInputStream;
26+
import java.io.FileNotFoundException;
2627
import java.io.FileOutputStream;
28+
import java.io.FilenameFilter;
2729
import java.io.IOException;
2830
import java.io.InputStream;
31+
import java.io.PrintStream;
2932
import java.net.HttpURLConnection;
3033
import java.net.SocketTimeoutException;
3134
import java.net.URL;
@@ -39,14 +42,21 @@
3942
import java.nio.file.Paths;
4043
import java.nio.file.SimpleFileVisitor;
4144
import java.nio.file.attribute.BasicFileAttributes;
45+
import java.util.Scanner;
4246

4347
import org.slf4j.Logger;
4448
import org.slf4j.LoggerFactory;
4549

4650
public class FileDownloadUtils {
4751

52+
private static final String SIZE_EXT = ".size";
53+
private static final String HASH_EXT = ".hash";
4854
private static final Logger logger = LoggerFactory.getLogger(FileDownloadUtils.class);
4955

56+
/**
 * Hashing algorithms that a downloaded file's hash metadata file may refer to.
 * The constant name is used verbatim as the suffix of the hash metadata file
 * name (<code>file.ext.hash_XXXX</code>).
 */
public enum Hash {
	/** MD5 digest. */
	MD5,
	/** SHA-1 digest. */
	SHA1,
	/** SHA-256 digest. */
	SHA256,
	/** No (or an unrecognized) algorithm; used when no hash URL is supplied. */
	UNKNOWN
}
59+
5060
/**
5161
* Copy the content of file src to dst TODO since java 1.7 this is provided
5262
* in java.nio.file.Files
@@ -154,13 +164,134 @@ public static void downloadFile(URL url, File destination) throws IOException {
154164
}
155165
}
156166

157-
logger.debug("Copying temp file {} to final location {}", tempFile, destination);
167+
logger.debug("Copying temp file [{}] to final location [{}]", tempFile, destination);
158168
copy(tempFile, destination);
159169

160170
// delete the tmp file
161171
tempFile.delete();
162172

163173
}
174+
175+
/**
176+
* Creates validation files beside a file to be downloaded.<br>
177+
* Whenever possible, for a <code>file.ext</code> file, it creates
178+
* <code>file.ext.size</code> and <code>file.hash</code> for in the same
179+
* folder where <code>file.ext</code> exists.
180+
* If the file connection size could not be deduced from the URL, no size file is created.
181+
* If <code>hashURL</code> is <code>null</code>, no hash file is created.
182+
* @param url the remote file URL to download
183+
* @param localDestination the local file to download into
184+
* @param hashURL the URL of the hash file to download. Can be <code>null</code>.
185+
* @param hash The Hashing algorithm. Ignored if <code>hashURL</code> is <code>null</code>.
186+
*/
187+
public static void createValidationFiles(URL url, File localDestination, URL hashURL, Hash hash){
188+
try {
189+
URLConnection resourceConnection = url.openConnection();
190+
createValidationFiles(resourceConnection, localDestination, hashURL, FileDownloadUtils.Hash.UNKNOWN);
191+
} catch (IOException e) {
192+
logger.warn("could not open connection to resource file due to exception: {}", e.getMessage());
193+
}
194+
}
195+
/**
196+
* Creates validation files beside a file to be downloaded.<br>
197+
* Whenever possible, for a <code>file.ext</code> file, it creates
198+
* <code>file.ext.size</code> and <code>file.hash_XXXX</code> in the same
199+
* folder where <code>file.ext</code> exists (XXXX may be DM5, SHA1, or SHA256).
200+
* If the file connection size could not be deduced from the resourceUrlConnection
201+
* {@link URLConnection}, no size file is created.
202+
* If <code>hashURL</code> is <code>null</code>, no hash file is created.<br>
203+
* <b>N.B.</b> None of the hashing algorithms is implemented (yet), because we did not need any of them yet.
204+
* @param resourceUrlConnection the remote file URLConnection to download
205+
* @param localDestination the local file to download into
206+
* @param hashURL the URL of the hash file to download. Can be <code>null</code>.
207+
* @param hash The Hashing algorithm. Ignored if <code>hashURL</code> is <code>null</code>.
208+
* @since 7.0.0
209+
*/
210+
public static void createValidationFiles(URLConnection resourceUrlConnection, File localDestination, URL hashURL, Hash hash){
211+
long size = resourceUrlConnection.getContentLengthLong();
212+
if(size == -1) {
213+
logger.warn("could not find expected file size for resource {}.", resourceUrlConnection.getURL());
214+
} else {
215+
logger.debug("Content-Length: " + size);
216+
File sizeFile = new File(localDestination.getParentFile(), localDestination.getName() + SIZE_EXT);
217+
try (PrintStream sizePrintStream = new PrintStream(sizeFile)) {
218+
sizePrintStream.print(size);
219+
sizePrintStream.close();
220+
} catch (FileNotFoundException e) {
221+
logger.warn("could not write size validation file due to exception: {}", e.getMessage());
222+
}
223+
}
224+
225+
if(hashURL == null)
226+
return;
227+
228+
if(hash == Hash.UNKNOWN)
229+
throw new IllegalArgumentException("Hash URL given but algorithm is unknown");
230+
try {
231+
File hashFile = new File(localDestination.getParentFile(), String.format("%s%s_%s", localDestination.getName(), HASH_EXT, hash));
232+
downloadFile(hashURL, hashFile);
233+
} catch (IOException e) {
234+
logger.warn("could not write validation hash file due to exception: {}", e.getMessage());
235+
}
236+
}
237+
238+
/**
239+
* Validate a local file based on pre-existing metadata files for size and hash.<br>
240+
* If the passed in <code>localFile</code> parameter is a file named <code>file.ext</code>, the function searches in the same folder for:
241+
* <ul>
242+
* <li><code>file.ext.size</code>: If found, it compares the size stored in it to the length of <code>localFile</code> (in bytes).</li>
243+
* <li><code>file.ext.hash_XXXX (where XXXX is DM5, SHA1, or SHA256)</code>: If found, it compares the size stored in it to the hash code of <code>localFile</code>.</li>
244+
* </ul>
245+
* If any of these comparisons fail, the function returns <code>false</code>. otherwise it returns true.
246+
* <p>
247+
* <b>N.B.</b> None of the 3 common verification hashing algorithms are implement yet.
248+
* @param localFile The file to validate
249+
* @return <code>false</code> if any of the size or hash code metadata files exists but its contents does not match the expected value in the file, <code>true</code> otherwise.
250+
* @since 7.0.0
251+
*/
252+
public static boolean validateFile(File localFile) {
253+
File sizeFile = new File(localFile.getParentFile(), localFile.getName() + SIZE_EXT);
254+
if(sizeFile.exists()) {
255+
Scanner scanner = null;
256+
try {
257+
scanner = new Scanner(sizeFile);
258+
long expectedSize = scanner.nextLong();
259+
long actualLSize = localFile.length();
260+
if (expectedSize != actualLSize) {
261+
logger.warn("File [{}] size ({}) does not match expected size ({}).", localFile, actualLSize, expectedSize);
262+
return false;
263+
}
264+
} catch (FileNotFoundException e) {
265+
logger.warn("could not validate size of file [{}] because no size metadata file exists.", localFile);
266+
} finally {
267+
scanner.close();
268+
}
269+
}
270+
271+
File[] hashFiles = localFile.getParentFile().listFiles(new FilenameFilter() {
272+
String hashPattern = String.format("%s%s_(%s|%s|%s)", localFile.getName(), HASH_EXT, Hash.MD5, Hash.SHA1, Hash.SHA256);
273+
@Override
274+
public boolean accept(File dir, String name) {
275+
return name.matches(hashPattern);
276+
}
277+
});
278+
if(hashFiles.length > 0) {
279+
File hashFile = hashFiles[0];
280+
String name = hashFile.getName();
281+
String algo = name.substring(name.lastIndexOf('_') + 1);
282+
switch (Hash.valueOf(algo)) {
283+
case MD5:
284+
case SHA1:
285+
case SHA256:
286+
throw new UnsupportedOperationException("Not yet implemented");
287+
case UNKNOWN:
288+
default: // No need. Already checked above
289+
throw new IllegalArgumentException("Hashing algorithm not known: " + algo);
290+
}
291+
}
292+
293+
return true;
294+
}
164295

165296
/**
166297
* Converts path to Unix convention and adds a terminating slash if it was

biojava-core/src/test/java/org/biojava/nbio/core/util/FileDownloadUtilsTest.java

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,15 @@
44
import static org.biojava.nbio.core.util.FileDownloadUtils.getFilePrefix;
55
import static org.junit.jupiter.api.Assertions.assertEquals;
66
import static org.junit.jupiter.api.Assertions.assertFalse;
7+
import static org.junit.jupiter.api.Assertions.assertThrows;
78
import static org.junit.jupiter.api.Assertions.assertTrue;
89

910
import java.io.File;
1011
import java.io.FileInputStream;
1112
import java.io.FileOutputStream;
1213
import java.io.IOException;
14+
import java.io.PrintStream;
15+
import java.net.URL;
1316
import java.nio.file.Files;
1417

1518
import org.junit.jupiter.api.Nested;
@@ -190,4 +193,50 @@ void deleteFolderTree() throws IOException{
190193
assertFalse(toDelete.exists());
191194
}
192195
}
196+
197+
@Nested
198+
class CreateValidationFiles{
199+
200+
@Test
201+
void testValidationFiles() throws IOException{
202+
URL sourceUrl = new URL("https://ftp.wwpdb.org/pub/pdb/data/structures/divided/mmCIF/45/145d.cif.gz");
203+
File destFile = new File(System.getProperty("java.io.tmpdir"), "145d.cif.gz");
204+
File sizeFile = new File(destFile.getParentFile(), destFile.getName()+".size");
205+
File hashFile = new File(destFile.getParentFile(), destFile.getName()+".hash_MD5");
206+
System.out.println(destFile.getAbsolutePath());
207+
destFile.delete();
208+
sizeFile.delete();
209+
hashFile.delete();
210+
assertFalse(destFile.exists(), "couldn't delete dest file");
211+
assertFalse(sizeFile.exists(), "couldn't delete size file");
212+
assertFalse(hashFile.exists(), "couldn't delete hash file");
213+
214+
FileDownloadUtils.downloadFile(sourceUrl, destFile);
215+
assertTrue(destFile.exists(), "couldn't create dest file");
216+
217+
assertTrue(FileDownloadUtils.validateFile(destFile), "file detected to be invalid although there are no validation files");
218+
219+
PrintStream temp1 = new PrintStream(sizeFile);
220+
temp1.print(15); // some wrong size value
221+
temp1.close();
222+
assertFalse(FileDownloadUtils.validateFile(destFile), "file not detected to be invalid although size value is wrong.");
223+
System.out.println("Just ignore the previous warning. It is expected.");
224+
225+
FileDownloadUtils.createValidationFiles(sourceUrl, destFile, null, FileDownloadUtils.Hash.UNKNOWN);
226+
assertTrue(sizeFile.exists(), "couldn't create size file");
227+
assertTrue(FileDownloadUtils.validateFile(destFile), "file not detected to be invalid although there is correct size validation file");
228+
229+
PrintStream temp2 = new PrintStream(hashFile);
230+
temp2.print("ABCD"); // some wrong hash value
231+
temp2.close();
232+
//This is not yet implemented. I am using this test for documentation purpose.
233+
assertThrows(UnsupportedOperationException.class,
234+
() -> FileDownloadUtils.validateFile(destFile),
235+
"file not detected to be invalid although hash value is wrong.");
236+
237+
destFile.delete();
238+
sizeFile.delete();
239+
hashFile.delete();
240+
}
241+
}
193242
}

biojava-structure/src/main/java/org/biojava/nbio/structure/ecod/EcodFactory.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,9 @@ public static EcodDatabase getEcodDatabase(String version) {
8989
}
9090
} catch (IOException e) {
9191
// For parsing errors, just use the requested version
92+
// TODO What about corrupted downloading errors?? Amr
93+
logger.warn("Could not get Ecod version, or file is corrupted", e);
94+
return null;
9295
}
9396
}
9497
logger.trace("Releasing EcodFactory lock after getting version "+version);

biojava-structure/src/main/java/org/biojava/nbio/structure/ecod/EcodInstallation.java

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -369,7 +369,7 @@ private boolean domainsAvailable() {
369369
try {
370370
File f = getDomainFile();
371371

372-
if (!f.exists() || f.length() <= 0 )
372+
if (! (f.exists() && FileDownloadUtils.validateFile(f)))
373373
return false;
374374

375375
// Re-download old copies of "latest"
@@ -395,8 +395,8 @@ private boolean domainsAvailable() {
395395
}
396396

397397
/**
398-
* Downloads the domains file, overwriting any existing file
399-
* @throws IOException
398+
* Downloads the domains file +/- its validation metadata, overwriting any existing file
399+
* @throws IOException in cases of file I/O, including failure to download a healthy (non-corrupted) file.
400400
*/
401401
private void downloadDomains() throws IOException {
402402
domainsFileLock.writeLock().lock();
@@ -406,7 +406,10 @@ private void downloadDomains() throws IOException {
406406
File localFile = getDomainFile();
407407

408408
logger.info("Downloading {} to: {}",domainsURL, localFile);
409+
FileDownloadUtils.createValidationFiles(domainsURL, localFile, null, FileDownloadUtils.Hash.UNKNOWN);
409410
FileDownloadUtils.downloadFile(domainsURL, localFile);
411+
if(! FileDownloadUtils.validateFile(localFile))
412+
throw new IOException("Downloaded file invalid: "+ localFile);
410413
} catch (MalformedURLException e) {
411414
logger.error("Malformed url: "+ url + DOMAINS_PATH + getDomainFilename(),e);
412415
} finally {

biojava-structure/src/main/java/org/biojava/nbio/structure/io/LocalPDBDirectory.java

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -362,7 +362,7 @@ public Structure getStructureById(PdbId pdbId) throws IOException {
362362
* for direct parsing.
363363
* @param pdbId
364364
* @return
365-
* @throws IOException
365+
* @throws IOException in cases of file I/O, including failure to download a healthy (non-corrupted) file.
366366
*/
367367
protected InputStream getInputStream(PdbId pdbId) throws IOException{
368368

@@ -373,6 +373,9 @@ protected InputStream getInputStream(PdbId pdbId) throws IOException{
373373
throw new IOException("Structure "+pdbId+" not found and unable to download.");
374374
}
375375

376+
if(! FileDownloadUtils.validateFile(file))
377+
throw new IOException("Downloaded file invalid: "+file);
378+
376379
InputStreamProvider isp = new InputStreamProvider();
377380

378381
InputStream inputStream = isp.getInputStream(file);
@@ -385,7 +388,7 @@ protected InputStream getInputStream(PdbId pdbId) throws IOException{
385388
*
386389
* Used to pre-fetch large numbers of structures.
387390
* @param pdbId
388-
* @throws IOException
391+
* @throws IOException in cases of file I/O, including failure to download a healthy (non-corrupted) file.
389392
*/
390393
public void prefetchStructure(String pdbId) throws IOException {
391394

@@ -395,6 +398,8 @@ public void prefetchStructure(String pdbId) throws IOException {
395398
if(!file.exists()) {
396399
throw new IOException("Structure "+pdbId+" not found and unable to download.");
397400
}
401+
if(! FileDownloadUtils.validateFile(file))
402+
throw new IOException("Downloaded file invalid: "+file);
398403
}
399404

400405
/**
@@ -525,14 +530,14 @@ protected File downloadStructure(PdbId pdbId) throws IOException {
525530
}
526531

527532
/**
528-
* Download a file from the ftp server, replacing any existing files if needed
533+
* Download a file from the ftp server +/- its validation metadata, replacing any existing files if needed
529534
* @param pdbId PDB ID
530535
* @param pathOnServer Path on the FTP server, e.g. data/structures/divided/pdb
531536
* @param obsolete Whether or not file should be saved to the obsolete location locally
532537
* @param existingFile if not null and checkServerFileDate is true, the last modified date of the
533538
* server file and this file will be compared to decide whether to download or not
534539
* @return
535-
* @throws IOException
540+
* @throws IOException in cases of file I/O, including failure to download a healthy (non-corrupted) file.
536541
*/
537542
private File downloadStructure(PdbId pdbId, String pathOnServer, boolean obsolete, File existingFile)
538543
throws IOException{
@@ -576,7 +581,10 @@ private File downloadStructure(PdbId pdbId, String pathOnServer, boolean obsolet
576581
logger.info("Fetching " + ftp);
577582
logger.info("Writing to "+ realFile);
578583

584+
FileDownloadUtils.createValidationFiles(url, realFile, null, FileDownloadUtils.Hash.UNKNOWN);
579585
FileDownloadUtils.downloadFile(url, realFile);
586+
if(! FileDownloadUtils.validateFile(realFile))
587+
throw new IOException("Downloaded file invalid: "+realFile);
580588

581589
return realFile;
582590
}

biojava-structure/src/main/java/org/biojava/nbio/structure/io/sifts/SiftsMappingProvider.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,10 +87,15 @@ public static List<SiftsEntity> getSiftsMapping(String pdbId) throws IOException
8787
if ( ! dest.exists()){
8888
String u = String.format(fileLoc,pdbId);
8989
URL url = new URL(u);
90+
logger.debug("Downloading SIFTS file {} validation metadata.",url);
91+
FileDownloadUtils.createValidationFiles(url, dest, null, FileDownloadUtils.Hash.UNKNOWN);
9092
logger.debug("Downloading SIFTS file {} to {}",url,dest);
9193
FileDownloadUtils.downloadFile(url, dest);
9294
}
9395

96+
if(! FileDownloadUtils.validateFile(dest))
97+
throw new IOException("Downloaded file invalid: "+dest);
98+
9499
InputStreamProvider prov = new InputStreamProvider();
95100
InputStream is = prov.getInputStream(dest);
96101
SiftsXMLParser parser = new SiftsXMLParser();

0 commit comments

Comments
 (0)