4343import org .biojava .nbio .core .sequence .ProteinSequence ;
4444import org .biojava .nbio .core .sequence .RNASequence ;
4545import org .biojava .nbio .core .sequence .compound .AmbiguityDNACompoundSet ;
46+ import org .biojava .nbio .core .sequence .compound .AmbiguityDNARNAHybridCompoundSet ;
4647import org .biojava .nbio .core .sequence .compound .AmbiguityRNACompoundSet ;
4748import org .biojava .nbio .core .sequence .compound .AminoAcidCompound ;
4849import org .biojava .nbio .core .sequence .compound .AminoAcidCompoundSet ;
6667import org .slf4j .LoggerFactory ;
6768
6869
69- /** Aligns the SEQRES residues to the ATOM residues.
70+ /**
71+ * Aligns the SEQRES residues to the ATOM residues.
7072 * The AminoAcids that can be matched between the two of them will be set in the SEQRES
7173 * chains
7274 *
7375 *
7476 * @author Andreas Prlic
75- *
77+ * @author Jose Duarte
7678 */
7779public class SeqRes2AtomAligner {
7880
@@ -338,9 +340,11 @@ private List<Group> trySimpleMatch(List<Group> seqResGroups,List<Group> atmResGr
338340 * that it allows us to also align HETATM groups back to the SEQRES.
339341 * @param groups the list of groups in a parent
340342 * @param positionIndex a Map to keep track of which group is at which sequence position
343+ * @param isNucleotideChain whether the atom groups are predominantly nucleotides (the groups represent a nucleotide chain), if true
344+ * non-standard nucleotides will be represented with ambiguous letter 'N' instead of 'X', if false all non-standard residues will be 'X'
341345 * @return string representations
342346 */
343- public static String getFullAtomSequence (List <Group > groups , Map <Integer , Integer > positionIndex ){
347+ public static String getFullAtomSequence (List <Group > groups , Map <Integer , Integer > positionIndex , boolean isNucleotideChain ){
344348
345349 StringBuffer sequence = new StringBuffer () ;
346350 int seqIndex = 0 ; // track sequence.length()
@@ -386,8 +390,14 @@ public static String getFullAtomSequence(List<Group> groups, Map<Integer, Intege
386390 ) {
387391 //System.out.println(cc.getOne_letter_code());
388392 String c = cc .getOne_letter_code ();
389- if ( c .equals ("?" ))
390- c = "X" ;
393+ if ( c .equals ("?" )) {
394+ if (isNucleotideChain && PolymerType .POLYNUCLEOTIDE_ONLY .contains (cc .getPolymerType ())) {
395+ // the way to represent unknown nucleotides is with 'N', see https://en.wikipedia.org/wiki/Nucleic_acid_notation
396+ c = "N" ;
397+ } else {
398+ c = "X" ;
399+ }
400+ }
391401
392402 // For some unusual cases the het residue can map to 2 or more standard aas and thus give an
393403 // insertion of length >1.
@@ -425,9 +435,9 @@ private boolean alignNucleotideGroups(List<Group> seqRes, List<Group> atomRes) {
425435 Map <Integer ,Integer > seqresIndexPosition = new HashMap <Integer , Integer >();
426436 Map <Integer ,Integer > atomIndexPosition = new HashMap <Integer , Integer >();
427437
428- String seq1 = getFullAtomSequence (seqRes , seqresIndexPosition );
438+ String seq1 = getFullAtomSequence (seqRes , seqresIndexPosition , true );
429439 //
430- String seq2 = getFullAtomSequence (atomRes , atomIndexPosition );
440+ String seq2 = getFullAtomSequence (atomRes , atomIndexPosition , true );
431441
432442 if (seq1 .isEmpty () || seq2 .isEmpty ()) {
433443 logger .warn ("Could not align nucleotide sequences, at least one of them is empty" );
@@ -436,34 +446,11 @@ private boolean alignNucleotideGroups(List<Group> seqRes, List<Group> atomRes) {
436446
437447 logger .debug ("align seq1 (" + seq1 .length ()+") " + seq1 );
438448 logger .debug ("align seq2 (" + seq2 .length ()+") " + seq2 );
439-
440449
450+ Sequence <NucleotideCompound > s1 = getNucleotideSequence (seq1 );
451+ Sequence <NucleotideCompound > s2 = getNucleotideSequence (seq2 );
441452
442- Sequence <NucleotideCompound > s1 ;
443- Sequence <NucleotideCompound > s2 ;
444-
445- try {
446- s1 = new DNASequence (seq1 ,AmbiguityDNACompoundSet .getDNACompoundSet ());
447- } catch (CompoundNotFoundException e ){
448- ///oops perhaps a RNA sequence?
449- try {
450- s1 = new RNASequence (seq1 ,AmbiguityRNACompoundSet .getRNACompoundSet ());
451- } catch (CompoundNotFoundException ex ) {
452- logger .warn ("Could not determine compound set for sequence 1 " + seq1 );
453- return true ;
454- }
455- }
456- try {
457- s2 = new DNASequence (seq2 ,AmbiguityDNACompoundSet .getDNACompoundSet ());
458- } catch (CompoundNotFoundException e ){
459- ///oops perhaps a RNA sequence?
460- try {
461- s2 = new RNASequence (seq2 ,AmbiguityRNACompoundSet .getRNACompoundSet ());
462- } catch (CompoundNotFoundException ex ) {
463- logger .warn ("Could not determine compound set for sequence 2 " + seq2 );
464- return true ;
465- }
466- }
453+ if (s1 ==null || s2 ==null ) return true ;
467454
468455 if ( ! s1 .getCompoundSet ().equals (s2 .getCompoundSet ()) ) {
469456 // e.g. trying to align a DNA and an RNA sequence...
@@ -477,15 +464,15 @@ private boolean alignNucleotideGroups(List<Group> seqRes, List<Group> atomRes) {
477464 return true ;
478465 }
479466 }
480-
467+
481468 if ( ! s2 .getCompoundSet ().equals (AmbiguityRNACompoundSet .getRNACompoundSet ())) {
482- try {
483- s2 = new RNASequence (seq2 ,AmbiguityRNACompoundSet .getRNACompoundSet ());
484- } catch (CompoundNotFoundException ex ) {
485- logger .warn ("Could not align DNA and RNA compound sets: " + seq2 );
486- return true ;
487- }
469+ try {
470+ s2 = new RNASequence (seq2 ,AmbiguityRNACompoundSet .getRNACompoundSet ());
471+ } catch (CompoundNotFoundException ex ) {
472+ logger .warn ("Could not align DNA and RNA compound sets: " + seq2 );
473+ return true ;
488474 }
475+ }
489476 }
490477
491478
@@ -521,6 +508,35 @@ private boolean alignNucleotideGroups(List<Group> seqRes, List<Group> atomRes) {
521508 return noMatchFound ;
522509
523510 }
511+
512+ private Sequence <NucleotideCompound > getNucleotideSequence (String seq ) {
513+ Sequence <NucleotideCompound > s = null ;
514+
515+ // first we try DNA, then RNA, them hybrid
516+
517+ try {
518+
519+ s = new DNASequence (seq , AmbiguityDNACompoundSet .getDNACompoundSet ());
520+ } catch (CompoundNotFoundException e ){
521+
522+ try {
523+ s = new RNASequence (seq , AmbiguityRNACompoundSet .getRNACompoundSet ());
524+ } catch (CompoundNotFoundException ex ) {
525+
526+ try {
527+ // it could still be a hybrid, e.g. 3hoz, chain T, what to do in that case?
528+ s = new DNASequence (seq , AmbiguityDNARNAHybridCompoundSet .getDNARNAHybridCompoundSet ());
529+ logger .warn ("Hybrid RNA/DNA sequence detected for sequence {}" , seq );
530+ } catch (CompoundNotFoundException exc ) {
531+ // not DNA, nor RNA, nor hybrid
532+ logger .warn ("Could not determine compound set (neither DNA, RNA nor hybrid) for nucleotide sequence " + seq );
533+ return null ;
534+ }
535+
536+ }
537+ }
538+ return s ;
539+ }
524540
525541
526542 /**
@@ -539,9 +555,9 @@ private boolean alignProteinChains(List<Group> seqRes, List<Group> atomRes) {
539555 Map <Integer ,Integer > seqresIndexPosition = new HashMap <Integer , Integer >();
540556 Map <Integer ,Integer > atomIndexPosition = new HashMap <Integer , Integer >();
541557
542- String seq1 = getFullAtomSequence (seqRes , seqresIndexPosition );
558+ String seq1 = getFullAtomSequence (seqRes , seqresIndexPosition , false );
543559 //
544- String seq2 = getFullAtomSequence (atomRes , atomIndexPosition );
560+ String seq2 = getFullAtomSequence (atomRes , atomIndexPosition , false );
545561
546562
547563 logger .debug ("Protein seq1 to align (length " + seq1 .length ()+"): " + seq1 );
@@ -597,16 +613,11 @@ private boolean mapChains(List<Group> seqResGroups, List<Group> atomRes,
597613
598614
599615
600- // at the present stage the future seqREs are still stored as Atom groups in the seqRes parent...
616+ // at the present stage the future seqRes are still stored as Atom groups in the seqRes parent...
601617
602618
603619 int aligLength = pair .getLength ();
604620
605- // System.out.println("sequence length seqres:");
606- // System.out.println(seqresIndexPosition.keySet().size());
607- // System.out.println("alignment length: " + aligLength);
608- // System.out.println(gapSymbol.getName());
609-
610621 // make sure we actually find an alignment
611622 boolean noMatchFound = true ;
612623
@@ -616,11 +627,11 @@ private boolean mapChains(List<Group> seqResGroups, List<Group> atomRes,
616627 for (int i = 1 ; i <= aligLength ; i ++) {
617628
618629 Compound s = pair .getCompoundAt (1 , i );
619- Compound a = pair .getCompoundAt (2 ,i );
630+ Compound a = pair .getCompoundAt (2 , i );
620631
621632 // alignment is using internal index start at 1...
622- int posSeq = pair .getIndexInQueryAt (i ) -1 ;
623- int posAtom = pair .getIndexInTargetAt (i ) -1 ;
633+ int posSeq = pair .getIndexInQueryAt (i ) - 1 ;
634+ int posAtom = pair .getIndexInTargetAt (i ) - 1 ;
624635
625636 if ( s .equals (gapSymbol ) || a .equals (gapSymbol )){
626637 continue ;
@@ -644,24 +655,24 @@ private boolean mapChains(List<Group> seqResGroups, List<Group> atomRes,
644655 // pdb1b2m
645656 String pdbNameS = s1 .getPDBName ();
646657 String pdbNameA = a1 .getPDBName ();
658+
647659 if ( pdbNameS == null || pdbNameA == null ){
648660 logger .warn ("null value for group.getPDBName found at {} when trying to align {} and {} {}" ,posSeq , s1 , a1 , posAtom );
649661 logger .warn ("ATOM and SEQRES sequences will not be aligned." );
650662 return true ;
651663 }
652- if (! pdbNameA .equals (pdbNameS )){
653- if ( ! pdbNameA .trim ().equals (pdbNameS .trim ())) {
654- logger .info (s1 + " " + posSeq + " does not align with " + a1 + " " + posAtom + " should be: " + s + " : " + a );
655- if ( s1 .getType ().equals (HetatomImpl .type ) && a1 .getType ().equals (HetatomImpl .type )){
656- logger .info ("they seem to be hetatoms, so ignoring mismatch." );
657- }
658- else {
659- //System.out.println(lst1.seqString());
660- //System.out.println(lst2.seqString());
661- logger .warn ("Could not match residues {} {}" , s1 , a1 );
662- }
664+
665+ if ( ! pdbNameA .trim ().equals (pdbNameS .trim ())) {
663666
667+ String msg = "'" + s1 + "' (position " + posSeq + ") does not align with '" + a1 + "' (position " + posAtom + "), should be: " + s + " : " + a ;
668+
669+ if ( s1 .getType ().equals (HetatomImpl .type ) && a1 .getType ().equals (HetatomImpl .type )){
670+ logger .info (msg + ". They seem to be hetatoms, so ignoring mismatch." );
664671 }
672+ else {
673+ logger .warn (msg + ". This could be a problem because they aren't both hetatoms" );
674+ }
675+
665676 }
666677
667678 // do the actual replacing of the SEQRES group with the ATOM group
0 commit comments