Skip to content

Commit 5868d90

Browse files
author
Sebastian Bittrich
committed
replaces SimpleDateFormat with mature implementation
- solves Exceptions in binary parsing + speed-up (?) - Exception were parsing exceptions from the prominent Date class when executing in parallel. Not possible to reproduce, but omitted by changing to robust, thread-safe number parsing class.
1 parent eeadc2f commit 5868d90

2 files changed

Lines changed: 50 additions & 56 deletions

File tree

biojava-integrationtest/src/test/java/org/biojava/nbio/structure/test/io/cif/CifFileConsumerImplTest.java

Lines changed: 30 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,28 +12,34 @@
1212
import java.io.UncheckedIOException;
1313
import java.nio.file.Files;
1414
import java.nio.file.Paths;
15+
import java.util.ArrayList;
16+
import java.util.Collections;
1517
import java.util.Date;
1618
import java.util.List;
1719
import java.util.concurrent.atomic.AtomicInteger;
18-
import java.util.stream.Stream;
20+
import java.util.regex.Pattern;
1921
import java.util.zip.GZIPInputStream;
2022

2123
import static org.junit.Assert.*;
2224

2325
public class CifFileConsumerImplTest {
24-
/**
25-
* java.lang.NumberFormatException: multiple points have been thrown.
26-
*/
2726
@Test
2827
@Ignore("ignored for now as Bcif file source may change - currently using local files")
29-
public void testNumberFormat() {
30-
Stream.of("1z4s", "4hec", "1dzw", "2y28").forEach(pdbId -> {
31-
System.out.println(pdbId);
32-
Structure cif = loadLocalCif(pdbId);
33-
assertNumberFormat(cif);
34-
Structure bcif = loadLocalBcif(pdbId);
35-
assertNumberFormat(bcif);
36-
});
28+
public void testFailingEntries() {
29+
Pattern.compile(", ").splitAsStream("4he8, 1z4u, 4fp1, 1blc, 4cit, 2y2y, 4exq, 2n0f, 2d9o, 2v16, 1kqv, " +
30+
"1bwo, 2k2g, 1qhd, 5mhj, 2dn3, 5pq8, 5cay, 6ms1, 2vhu, 2gi0, 3swe, 3daz, 5yel, 2pxp, 4uis, 3cs1, 3in5," +
31+
" 1sl3, 4hjc, 3hj2, 5kpi, 1gyq, 1yq8, 4yqz, 1ox3, 2pls, 1vne, 4q02, 1dtt, 1jau, 5h3b, 5sxk, 4el7, 5q7w," +
32+
" 4zuz, 1n6i, 1dhg, 3dhe, 2gpo, 5if7, 5ld8, 1jhz, 4fr3, 1r6u, 3hdl, 5fse, 1iho, 1t10, 2oc6, 3czx, 3b3o," +
33+
" 5i6w, 2ecv, 4l2x, 441d, 2i0x, 1xq4, 3tbb, 4mmz, 1qew, 6i16, 1t8d, 5w7r, 6gm1, 1s7u, 2qp3, 1cf3, 4myb," +
34+
" 1omh, 1zog, 2b68, 1nqb, 1t7k")
35+
.parallel()
36+
.forEach(pdbId -> {
37+
System.out.println(pdbId);
38+
Structure cif = loadLocalCif(pdbId);
39+
assertNumberFormat(cif);
40+
Structure bcif = loadLocalBcif(pdbId);
41+
assertNumberFormat(bcif);
42+
});
3743
}
3844

3945
private Structure loadLocalCif(String pdbId) {
@@ -63,13 +69,21 @@ private void assertNumberFormat(Structure structure) {
6369
assertNotNull(modDate);
6470
}
6571

72+
/**
73+
* Performance diary;
74+
*
75+
* 05/01/19 - ciftools v0.3.0, parallel, bcif, non-gzipped, 12 worker threads
76+
* 918 s for 151079 structures, 6073 µs per structure, failed for 0 entries
77+
*
78+
* @throws IOException propagated
79+
*/
6680
@Test
6781
@Ignore("ignore long-running test, do run to track performance")
6882
public void parseEntireArchive() throws IOException {
6983
AtomicInteger counter = new AtomicInteger(0);
70-
AtomicInteger failed = new AtomicInteger(0);
7184
long start = System.nanoTime();
7285
int chunkSize = 250;
86+
List<String> failed = Collections.synchronizedList(new ArrayList<>());
7387

7488
Files.walk(Paths.get(
7589
// change to your own paths
@@ -93,13 +107,14 @@ public void parseEntireArchive() throws IOException {
93107
} catch (Exception e) {
94108
System.err.println("failed for " + path.toFile().getAbsolutePath());
95109
e.printStackTrace();
96-
failed.incrementAndGet();
110+
failed.add(path.toFile().getName().split("\\.")[0]);
97111
}
98112
});
99113

100114
long end = System.nanoTime();
101115
System.out.println((end - start) / 1_000_000_000 + " s");
102-
System.out.println("failed for " + failed.intValue() + " structures");
116+
System.out.println("failed for " + failed.size() + " structures");
117+
System.out.println("failed ids: " + failed);
103118
}
104119

105120
private static boolean headerOnly;

biojava-structure/src/main/java/org/biojava/nbio/structure/io/cif/CifFileConsumerImpl.java

Lines changed: 20 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,11 @@
1818
import org.slf4j.LoggerFactory;
1919

2020
import javax.vecmath.Matrix4d;
21-
import java.text.ParseException;
22-
import java.text.SimpleDateFormat;
21+
import java.time.LocalDate;
22+
import java.time.LocalDateTime;
23+
import java.time.ZoneId;
24+
import java.time.format.DateTimeFormatter;
25+
import java.time.format.DateTimeFormatterBuilder;
2326
import java.util.*;
2427
import java.util.stream.Collectors;
2528
import java.util.stream.IntStream;
@@ -37,7 +40,10 @@
3740
*/
3841
class CifFileConsumerImpl implements CifFileConsumer<Structure> {
3942
private static final Logger logger = LoggerFactory.getLogger(CifFileConsumerImpl.class);
40-
private static final SimpleDateFormat DATE_FORMAT = new SimpleDateFormat("yyyy-MM-dd", Locale.US);
43+
private static final DateTimeFormatter DATE_FORMAT = new DateTimeFormatterBuilder()
44+
.parseCaseInsensitive()
45+
.appendPattern("yyyy-MM-dd")
46+
.toFormatter(Locale.US);
4147

4248
private Structure structure;
4349
private Chain currentChain;
@@ -524,36 +530,24 @@ public void consumeDatabasePDBremark(DatabasePDBRemark databasePDBremark) {
524530
}
525531
}
526532

533+
private Date convert(LocalDate localDate) {
534+
return Date.from(localDate.atStartOfDay().atZone(ZoneId.systemDefault()).toInstant());
535+
}
536+
527537
@Override
528538
public void consumeDatabasePDBrev(DatabasePDBRev databasePDBrev) {
529539
logger.debug("got a database revision:" + databasePDBrev);
530540

531541
for (int rowIndex = 0; rowIndex < databasePDBrev.getRowCount(); rowIndex++) {
532542
if (databasePDBrev.getNum().get(rowIndex) == 1) {
533543
String dateOriginal = databasePDBrev.getDateOriginal().get(rowIndex);
534-
try {
535-
Date dep = DATE_FORMAT.parse(dateOriginal);
536-
pdbHeader.setDepDate(dep);
537-
} catch (ParseException e){
538-
logger.warn("Could not parse date string '{}', deposition date will be unavailable",
539-
dateOriginal);
540-
}
544+
pdbHeader.setDepDate(convert(LocalDate.parse(dateOriginal, DATE_FORMAT)));
541545

542546
String date = databasePDBrev.getDate().get(rowIndex);
543-
try {
544-
Date rel = DATE_FORMAT.parse(date);
545-
pdbHeader.setRelDate(rel);
546-
} catch (ParseException e){
547-
logger.warn("Could not parse date string '{}', modification date will be unavailable", date);
548-
}
547+
pdbHeader.setRelDate(convert(LocalDate.parse(date, DATE_FORMAT)));
549548
} else {
550549
String dbrev = databasePDBrev.getDate().get(rowIndex);
551-
try {
552-
Date mod = DATE_FORMAT.parse(dbrev);
553-
pdbHeader.setModDate(mod);
554-
} catch (ParseException e){
555-
logger.warn("Could not parse date string '{}', modification date will be unavailable", dbrev);
556-
}
550+
pdbHeader.setModDate(convert(LocalDate.parse(dbrev, DATE_FORMAT)));
557551
}
558552
}
559553
}
@@ -677,23 +671,13 @@ public void consumePdbxAuditRevisionHistory(PdbxAuditRevisionHistory pdbxAuditRe
677671
// first entry in revision history is the release date
678672
if (pdbxAuditRevisionHistory.getOrdinal().get(rowIndex) == 1) {
679673
String release = pdbxAuditRevisionHistory.getRevisionDate().get(rowIndex);
680-
try {
681-
Date releaseDate = DATE_FORMAT.parse(release);
682-
pdbHeader.setRelDate(releaseDate);
683-
} catch (ParseException e) {
684-
logger.warn("Could not parse date string '{}', release date will be unavailable", release);
685-
}
674+
pdbHeader.setRelDate(convert(LocalDate.parse(release, DATE_FORMAT)));
686675
} else {
687676
// all other dates are revision dates;
688677
// since this method may be called multiple times,
689678
// the last revision date will "stick"
690679
String revision = pdbxAuditRevisionHistory.getRevisionDate().get(rowIndex);
691-
try {
692-
Date revisionDate = DATE_FORMAT.parse(revision);
693-
pdbHeader.setModDate(revisionDate);
694-
} catch (ParseException e) {
695-
logger.warn("Could not parse date string '{}', revision date will be unavailable", revision);
696-
}
680+
pdbHeader.setModDate(convert(LocalDate.parse(revision, DATE_FORMAT)));
697681
}
698682
}
699683
}
@@ -710,13 +694,7 @@ public void consumePdbxDatabaseStatus(PdbxDatabaseStatus pdbxDatabaseStatus) {
710694
StrColumn recvdInitialDepositionDate = pdbxDatabaseStatus.getRecvdInitialDepositionDate();
711695
if (recvdInitialDepositionDate.isDefined()) {
712696
String deposition = recvdInitialDepositionDate.get(rowIndex);
713-
714-
try {
715-
Date depositionDate = DATE_FORMAT.parse(deposition);
716-
pdbHeader.setDepDate(depositionDate);
717-
} catch (ParseException e) {
718-
logger.warn("Could not parse date string '{}', deposition date will be unavailable", deposition);
719-
}
697+
pdbHeader.setDepDate(convert(LocalDate.parse(deposition, DATE_FORMAT)));
720698
}
721699
}
722700
}
@@ -784,6 +762,7 @@ public void consumeRefine(Refine refine) {
784762
// there are 2 resolution values, one for each method
785763
// we take the last one found so that behaviour is like in PDB file parsing
786764
double lsDResHigh = refine.getLsDResHigh().get(rowIndex);
765+
// TODO this could use a check to keep reasonable values - 1.5 may be overwritten by 0.0
787766
if (pdbHeader.getResolution() != PDBHeader.DEFAULT_RESOLUTION) {
788767
logger.warn("More than 1 resolution value present, will use last one {} and discard previous {}",
789768
lsDResHigh, String.format("%4.2f",pdbHeader.getResolution()));

0 commit comments

Comments
 (0)