Skip to content

Commit 7a4693e

Browse files
committed
added alignment output formatting; outputs to CLUSTALW's ALN, FASTA, and GCG's MSF formats
git-svn-id: http://code.open-bio.org/repos/biojava/biojava-live/trunk@8202 7c6358e6-4a41-0410-a743-a5b2a554c398
1 parent 041636c commit 7a4693e

3 files changed

Lines changed: 219 additions & 10 deletions

File tree

biojava3-alignment/src/main/java/org/biojava3/alignment/SimpleProfile.java

Lines changed: 156 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,11 @@
3232
import org.biojava3.alignment.template.AlignedSequence.Step;
3333
import org.biojava3.alignment.template.Profile;
3434
import org.biojava3.alignment.template.ProfileView;
35+
import org.biojava3.core.sequence.Strand;
36+
import org.biojava3.core.sequence.compound.AmbiguityDNACompoundSet;
37+
import org.biojava3.core.sequence.compound.AmbiguityRNACompoundSet;
38+
import org.biojava3.core.sequence.compound.DNACompoundSet;
39+
import org.biojava3.core.sequence.compound.RNACompoundSet;
3540
import org.biojava3.core.sequence.location.template.Location;
3641
import org.biojava3.core.sequence.template.Compound;
3742
import org.biojava3.core.sequence.template.CompoundSet;
@@ -147,6 +152,8 @@ protected SimpleProfile(Profile<S, C> query, Profile<S, C> target, List<Step> sx
147152
length = sx.size();
148153
}
149154

155+
// methods for Profile
156+
150157
@Override
151158
public AlignedSequence<S, C> getAlignedSequence(int listIndex) {
152159
return list.get(listIndex - 1);
@@ -328,23 +335,164 @@ public boolean isCircular() {
328335

329336
@Override
330337
public String toString(int width) {
331-
// TODO String toString(int)
332-
return null;
338+
return toString(width, null, getIDFormat(), true, true, true, true, true);
333339
}
334340

335341
@Override
336-
public String toString() {
337-
// TODO handle circular alignments
338-
StringBuilder s = new StringBuilder();
339-
for (AlignedSequence<S, C> as : list) {
340-
s.append(String.format("%s%n", as.toString()));
342+
public String toString(StringFormat format) {
343+
switch (format) {
344+
case ALN:
345+
case CLUSTALW:
346+
default:
347+
return toString(60, String.format("CLUSTAL W MSA from BioJava%n%n"), getIDFormat() + " ", false, true,
348+
true, false, true);
349+
case FASTA:
350+
return toString(60, null, ">%s%n", false, false, false, false, false);
351+
case GCG:
352+
case MSF:
353+
String idFormat = getIDFormat();
354+
StringBuilder header = new StringBuilder();
355+
header.append(String.format("MSA from BioJava%n%n MSF: %d Type: %s Check: %d ..%n%n", getLength(),
356+
getGCGType(), getGCGChecksum()));
357+
for (AlignedSequence<S, C> as : list) {
358+
header.append(String.format(" Name: " + idFormat + " Len: %d Check: %4d Weight: %.1f%n",
359+
as.getAccession(), getLength(), getGCGChecksum(as), 1.0f)); // TODO show weights in MSF header
360+
}
361+
header.append(String.format("%n//%n%n"));
362+
// TODO? convert gap characters to '.'
363+
return toString(50, header.toString(), idFormat, false, false, true, false, false);
341364
}
342-
return s.toString();
343365
}
344366

367+
// method from Object
368+
369+
@Override
370+
public String toString() {
371+
return toString(getLength(), null, null, false, false, false, false, false);
372+
}
373+
374+
// method for Iterable
375+
345376
@Override
346377
public Iterator<AlignedSequence<S, C>> iterator() {
347378
return list.iterator();
348379
}
349380

381+
// helper methods
382+
383+
// calculates GCG checksum for entire Profile
384+
private int getGCGChecksum() {
385+
int check = 0;
386+
for (AlignedSequence<S, C> as : list) {
387+
check += getGCGChecksum(as);
388+
}
389+
return check % 10000;
390+
}
391+
392+
// calculates GCG checksum for a given Sequence
393+
private int getGCGChecksum(AlignedSequence<S, C> sequence) {
394+
String s = sequence.toString().toUpperCase();
395+
int count = 0, check = 0;
396+
for (int i = 0; i < s.length(); i++) {
397+
count++;
398+
check += count * s.charAt(i);
399+
if (count == 57) {
400+
count = 0;
401+
}
402+
}
403+
return check % 10000;
404+
}
405+
406+
// determines GCG type
407+
private String getGCGType() {
408+
CompoundSet<C> cs = getCompoundSet();
409+
return (cs == DNACompoundSet.getDNACompoundSet() || cs == AmbiguityDNACompoundSet.getDNACompoundSet()) ? "D" :
410+
(cs == RNACompoundSet.getRNACompoundSet() || cs == AmbiguityRNACompoundSet.getRNACompoundSet()) ? "R" :
411+
"P";
412+
}
413+
414+
// creates format String for accession IDs
415+
private String getIDFormat() {
416+
int length = 0;
417+
for (AlignedSequence<S, C> as : list) {
418+
length = Math.max(length, (as.getAccession() == null) ? 0 : as.getAccession().toString().length());
419+
}
420+
return (length == 0) ? null : "%-" + (length + 1) + "s";
421+
}
422+
423+
// creates formatted String
424+
private String toString(int width, String header, String idFormat, boolean seqIndexPre, boolean seqIndexPost,
425+
boolean interlaced, boolean aligIndices, boolean aligConservation) {
426+
// TODO handle circular alignments
427+
StringBuilder s = (header == null) ? new StringBuilder() : new StringBuilder(header);
428+
width = Math.max(1, width);
429+
int seqIndexPad = (int) (Math.floor(Math.log10(getLength())) + 2);
430+
String seqIndexFormatPre = "%" + seqIndexPad + "d ", seqIndexFormatPost = "%" + seqIndexPad + "d";
431+
if (interlaced) {
432+
String aligIndFormat = "%-" + Math.max(1, width / 2) + "d %" + Math.max(1, width - (width / 2) - 1) +
433+
"d%n";
434+
for (int i = 0; i < getLength(); i += width) {
435+
int start = i + 1, end = Math.min(getLength(), i + width);
436+
if (i > 0) {
437+
s.append(String.format("%n"));
438+
}
439+
if (aligIndices) {
440+
if (end < i + width) {
441+
int line = end - start + 1;
442+
aligIndFormat = "%-" + Math.max(1, line / 2) + "d %" + Math.max(1, line - (line / 2) - 1) +
443+
"d%n";
444+
}
445+
if (idFormat != null) {
446+
s.append(String.format(idFormat, ""));
447+
}
448+
if (seqIndexPre) {
449+
s.append(String.format("%" + (seqIndexPad + 1) + "s", ""));
450+
}
451+
s.append(String.format(aligIndFormat, start, end));
452+
}
453+
for (AlignedSequence<S, C> as : list) {
454+
if (idFormat != null) {
455+
s.append(String.format(idFormat, as.getAccession()));
456+
}
457+
if (seqIndexPre) {
458+
s.append(String.format(seqIndexFormatPre, as.getSequenceIndexAt(start)));
459+
}
460+
s.append(as.getSequenceAsString(start, end, Strand.UNDEFINED));
461+
if (seqIndexPost) {
462+
s.append(String.format(seqIndexFormatPost, as.getSequenceIndexAt(end)));
463+
}
464+
s.append(String.format("%n"));
465+
}
466+
if (aligConservation) {
467+
if (idFormat != null) {
468+
s.append(String.format(idFormat, ""));
469+
}
470+
if (seqIndexPre) {
471+
s.append(String.format("%" + (seqIndexPad + 1) + "s", ""));
472+
}
473+
// TODO conservation annotation
474+
s.append(String.format("%n"));
475+
}
476+
}
477+
} else {
478+
for (AlignedSequence<S, C> as : list) {
479+
if (idFormat != null) {
480+
s.append(String.format(idFormat, as.getAccession()));
481+
}
482+
for (int i = 0; i < getLength(); i += width) {
483+
int start = i + 1, end = Math.min(getLength(), i + width);
484+
if (seqIndexPre) {
485+
s.append(String.format(seqIndexFormatPre, as.getSequenceIndexAt(start)));
486+
}
487+
s.append(as.getSequenceAsString(start, end, Strand.UNDEFINED));
488+
if (seqIndexPost) {
489+
s.append(String.format(seqIndexFormatPost, as.getSequenceIndexAt(end)));
490+
}
491+
s.append(String.format("%n"));
492+
}
493+
}
494+
}
495+
return s.toString();
496+
}
497+
350498
}

biojava3-alignment/src/main/java/org/biojava3/alignment/template/Profile.java

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,17 @@
4040
*/
4141
public interface Profile<S extends Sequence<C>, C extends Compound> extends Iterable<AlignedSequence<S, C>> {
4242

43+
/**
 * List of output formats accepted by {@code toString(StringFormat)}.
 */
enum StringFormat {
    /** CLUSTAL W alignment format. */
    ALN,
    /** CLUSTAL W alignment format; listed next to {@link #ALN} under the program's name. */
    CLUSTALW,
    /** FASTA format. */
    FASTA,
    /** GCG multiple sequence format; listed next to {@link #MSF} under the package's name. */
    GCG,
    /** GCG multiple sequence format. */
    MSF
}
53+
4354
/**
4455
* Returns {@link AlignedSequence} at given index.
4556
*
@@ -253,4 +264,12 @@ public interface Profile<S extends Sequence<C>, C extends Compound> extends Iter
253264
*/
254265
String toString(int width);
255266

267+
/**
268+
* Returns a formatted view of the alignment profile. Details depend on the format given.
269+
*
270+
* @param format output format
271+
* @return a formatted view of the alignment profile
272+
*/
273+
String toString(StringFormat format);
274+
256275
}

biojava3-alignment/src/test/java/org/biojava3/alignment/SimpleProfileTest.java

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
3131
import org.biojava3.alignment.template.AlignedSequence;
3232
import org.biojava3.alignment.template.AlignedSequence.Step;
3333
import org.biojava3.alignment.template.Profile;
34+
import org.biojava3.alignment.template.Profile.StringFormat;
35+
import org.biojava3.core.sequence.AccessionID;
3436
import org.biojava3.core.sequence.ProteinSequence;
3537
import org.biojava3.core.sequence.compound.AminoAcidCompound;
3638
import org.biojava3.core.sequence.compound.AminoAcidCompoundSet;
@@ -47,6 +49,8 @@ public class SimpleProfileTest {
4749
public void setup() {
4850
query = new ProteinSequence("ARND");
4951
target = new ProteinSequence("RDG");
52+
query.setAccession(new AccessionID("Query"));
53+
target.setAccession(new AccessionID("Target"));
5054
global = new SimpleProfile<ProteinSequence, AminoAcidCompound>(query, target, Arrays.asList(new Step[] {
5155
Step.COMPOUND, Step.COMPOUND, Step.COMPOUND, Step.COMPOUND, Step.GAP}), 0, 0, Arrays.asList(
5256
new Step[] {Step.GAP, Step.COMPOUND, Step.GAP, Step.COMPOUND, Step.COMPOUND}), 0, 0);
@@ -441,10 +445,48 @@ public void testIsCircular() {
441445
assertFalse(single.isCircular());
442446
}
443447

444-
@Ignore // TODO SimpleProfile.toString(int)
445448
@Test
446449
public void testToStringInt() {
447-
fail("Not yet implemented");
450+
// TODO conservation annotation
451+
assertEquals(global.toString(3), String.format(
452+
" 1 3%n" +
453+
"Query 1 ARN 3%n" +
454+
"Target 1 -R- 1%n" +
455+
" %n%n" +
456+
" 4 5%n" +
457+
"Query 4 D- 4%n" +
458+
"Target 2 DG 3%n" +
459+
" %n"));
460+
assertEquals(local.toString(4), String.format(
461+
" 1 3%n" +
462+
"Query 2 RND 4%n" +
463+
"Target 1 R-D 2%n" +
464+
" %n"));
465+
assertEquals(single.toString(4), String.format(
466+
" 1 4%n" +
467+
"Query 1 ARND 4%n" +
468+
" %n"));
469+
}
470+
471+
@Test
472+
public void testToStringFormatted() {
473+
// TODO conservation annotation
474+
assertEquals(global.toString(StringFormat.ALN), String.format(
475+
"CLUSTAL W MSA from BioJava%n%n" +
476+
"Query ARND- 4%n" +
477+
"Target -R-DG 3%n" +
478+
" %n"));
479+
assertEquals(local.toString(StringFormat.FASTA), String.format(
480+
">Query%n" +
481+
"RND%n" +
482+
">Target%n" +
483+
"R-D%n"));
484+
assertEquals(single.toString(StringFormat.MSF), String.format(
485+
"MSA from BioJava%n%n" +
486+
" MSF: 4 Type: P Check: 735 ..%n%n" +
487+
" Name: Query Len: 4 Check: 735 Weight: 1.0%n" +
488+
"%n//%n%n" +
489+
"Query ARND%n"));
448490
}
449491

450492
@Test

0 commit comments

Comments
 (0)