;; dna-mode.el --- a major mode for editing dna sequences
;;
;; ~harley/share/emacs/pkg/dna/dna-mode.el ---
;;
;; $Id: dna-mode.el,v 1.43 2008/03/28 10:10:41 harley Exp $
;;
;; Author: Harley Gorrell
;; URL: http://www.mahalito.net/~harley/elisp/dna-mode.el
;; License: GPL v2
;; Keywords: dna, emacs, editing
;;; Commentary:
;; * A collection of functions for editing DNA sequences. It
;; provides functions to make editing dna in Emacs easier.
;;
;; Dna-mode will:
;; * Fontify keywords and line numbers in sequences.
;; * Fontify bases when font-lock-mode is disabled.
;; * Incrementally search dna over pads and numbers.
;; * Complement and reverse complement a region.
;; * Move over bases and entire sequences.
;; * Detect sequence files by content.
;;; Installation:
;; --------------------
;; Here are two suggested ways for installing this package.
;; You can choose to autoload it when needed, or load it
;; each time emacs is started. Put one of the following
;; sections in your .emacs:
;;
;; ---Autoload:
;; (autoload 'dna-mode "dna-mode" "Major mode for dna" t)
;; (add-to-list 'magic-mode-alist '("^>\\|ID\\|LOCUS\\|DNA" . dna-mode))
;; (add-to-list
;; 'auto-mode-alist
;; '("\\.\\(fasta\\|fa\\|exp\\|ace\\|gb\\)\\'" . dna-mode))
;; (add-hook 'dna-mode-hook 'turn-on-font-lock)
;;
;; ---Load:
;; (setq dna-do-setup-on-load t)
;; (load "/pathname/dna-mode")
;;
;; The dna-isearch-forward function (and isearch in general)
;; is much more useful with something like the following:
;; (make-face 'isearch)
;; (set-face-background 'isearch "yellow")
;; (setq-default search-highlight t)
;;; History:
;; 2003-03-16: Updated URL and contact info
;; 2004-04-20: Added dna-color-bases-region to the keymap for Mike.
;;; User customizable vars start here
;;; Code:
(defvar dna-mode-hook nil
"*Hook to setup `dna-mode'.")
(defvar dna-mode-load-hook nil
"*Hook to run when `dna-mode' is loaded.")
(defvar dna-setup-on-load nil
"*If not nil setup dna mode on load by running `dna-`add-hook's'.")
;; Bases
(defvar dna-valid-base-regexp
"[-*:acgtmrwsykvhdbxnACGTMRWSYKVHDBXN]"
"*A regexp which matches a single base.")
(defvar dna-base-complement-list
'((?- . ?-) (?n . ?n) (?* . ?*) (?x . ?x) (?: . ?:) ; identity
(?a . ?t) (?c . ?g) (?g . ?c) (?t . ?a) ; single
(?m . ?k) (?r . ?y) (?w . ?w) (?s . ?s) (?y . ?r) (?k . ?m) ; double
(?v . ?b) (?h . ?d) (?d . ?h) (?b . ?v) ; triple
)
"*List of bases and their complements.
Bases should be lowercase, as they are upcased when the `vector is made.")
;; These are the colors used when coloring bases.
(defvar dna-base-color-a "blue")
(defvar dna-base-color-c "black")
(defvar dna-base-color-g "green")
(defvar dna-base-color-t "red")
;; Dna-isearch
(defvar dna-cruft-regexp "[* 0-9\t\n]"
"*Regexp to match cruft which may appear between bases.
Skip over it during dna-motion and dna-isearch.")
(defvar dna-isearch-case-fold-search t
"*Case fold dna-isearches if set.")
;; Sequence
(defvar dna-sequence-start-regexp
"^\\(>\\|ID\\|LOCUS\\|DNA\\)"
"A regexp which matches the start of a sequence.")
;;; End of user customizable vars
;;; Start of internal vars and code
(defvar dna-base-complement-vector
(let ((c-vec (make-vector 256 nil))
(c-list dna-base-complement-list))
(while c-list
(aset c-vec (car (car c-list)) (cdr (car c-list)))
(aset c-vec (upcase (car (car c-list))) (upcase (cdr (car c-list))))
(setq c-list (cdr c-list)))
c-vec)
"A vector of upper and lower case bases and their complements.")
;; I also use "Alt" as C-c is too much to type for cursor motions.
(defvar dna-mode-map
(let ((map (make-sparse-keymap)))
;; Ctrl bindings
(define-key map "\C-c\C-f" 'dna-forward-base)
(define-key map "\C-cf" 'dna-forward-base)
(define-key map "\C-c\C-b" 'dna-backward-base)
(define-key map "\C-cb" 'dna-backward-base)
(define-key map "\C-c=" 'dna-base-position)
(define-key map "\C-cp" 'dna-beginning-of-sequence)
(define-key map "\C-c\C-s" 'dna-isearch-forward)
(define-key map "\C-cs" 'dna-isearch-forward)
(define-key map "\C-cr" 'dna-reverse-complement-region)
(define-key map "\C-cc" 'dna-complement-region)
(define-key map "\C-c#" 'dna-count-bases-region)
(define-key map "\M-\C-h" 'dna-mark-sequence)
(define-key map "\M-\C-a" 'dna-beginning-of-sequence)
(define-key map "\M-\C-e" 'dna-end-of-sequence)
;; base coloring
(define-key map "\C-cg" 'dna-color-bases-region)
(define-key map "\C-cl" 'font-lock-mode)
;; XEmacs does not like the Alt bindings
(when (not (string-match "XEmacs" (emacs-version)))
(define-key map [A-right] 'dna-forward-base)
(define-key map [A-left] 'dna-backward-base)
(define-key map [A-up] 'dna-beginning-of-sequence)
(define-key map [A-down] 'dna-end-of-sequence)
(define-key map [?\A-\C-s] 'dna-isearch-forward))
map)
"The local keymap for `dna-mode'.")
;;;###autoload
(defun dna-mode ()
"Major mode for editing DNA sequences.
This mode also customizes isearch to search over line
breaks. Use \\[universal-argument] number as a prefix to
`dna-forward-base' to move that many bases. This skips line
breaks and spaces.
`dna-color-bases-region' disables `font-lock-mode'
automaticly as they cant work together.
\\[dna-color-bases-region] turns `font-lock-mode' back on.
\\{dna-mode-map}"
(interactive)
;;
(kill-all-local-variables)
(setq mode-name "dna")
(setq major-mode 'dna-mode)
(use-local-map dna-mode-map)
;;
(make-local-variable 'font-lock-defaults)
(setq font-lock-defaults '(dna-font-lock-keywords))
;;
(make-local-variable 'dna-valid-base-regexp)
(make-local-variable 'dna-sequence-start-regexp)
(make-local-variable 'dna-cruft-regexp)
(make-local-variable 'dna-isearch-case-fold-search)
;;
(run-hooks 'dna-mode-hook))
;; Keywords
;; Todo: Seperate the keywords into a list for each format, rather
;; than one for all.
(defvar dna-font-lock-keywords
'(
;; Fasta
("^\\(>\\)\\([-_.|a-zA-Z0-9]+\\)\\([ \t]+.*\\)?"
(1 font-lock-keyword-face)
(2 font-lock-function-name-face)
(3 font-lock-comment-face nil t))
;; Exp
("^\\(ID\\) +\\([-_.a-zA-Z_0-9]+\\)"
(1 font-lock-keyword-face) (2 font-lock-function-name-face))
("^\\(CC\\|SQ\\)\\([ \t]\\(.*\\)\\)?$"
(1 font-lock-keyword-face) (3 font-lock-comment-face nil t))
("^\\(\\sw\\sw\\)[ \t]"
(1 font-lock-keyword-face))
("^\\(//\\)"
(1 font-lock-keyword-face))
;; Ace (phrap output)
("^\\(DNA\\|Sequence\\|BaseQuality\\) +\\([-_.a-zA-Z_0-9]+\\)"
(1 font-lock-keyword-face) (2 font-lock-function-name-face))
;; Genbank
("^\\(LOCUS\\) +\\([-_.a-zA-Z_0-9]+\\)";; are '-_.' allowed?
(1 font-lock-keyword-face) (2 font-lock-function-name-face))
"ORIGIN"
;; More genbank keywords...
"ACCESSION" "AUTHORS" "AUTHORS" "BASE COUNT" "DEFINITION"
"FEATURES" "JOURNAL" "JOURNAL" "KEYWORDS" "MEDLINE" "NID"
"ORGANISM" "REFERENCE" "SEGMENT" "SOURCE" "TITLE"
;; line numbers...
("^[ \t]*\\([0-9]+\\)"
(1 font-lock-string-face))
;; others...?
)
"Expressions to hilight in `dna-mode'.")
;;; Setup functions
(defun dna-find-file-func ()
"Invoke `dna-mode' if the buffer look like a sequence.
and another mode is not active.
This function is added to `find-file-hooks'."
(if (and (eq major-mode 'fundamental-mode)
(looking-at dna-sequence-start-regexp))
(dna-mode)))
;;;###autoload
(defun dna-add-hooks ()
"Add a default set of dna-hooks.
These hooks will activate `dna-mode' when visiting a file
which has a dna-like name (.fasta or .gb) or whose contents
looks like dna. It will also turn enable fontification for `dna-mode'."
(add-hook 'dna-mode-hook 'turn-on-font-lock)
(add-hook 'find-file-hooks 'dna-find-file-func)
(add-to-list
'auto-mode-alist
'("\\.\\(fasta\\|fa\\|exp\\|ace\\|gb\\)\\'" . dna-mode)))
;; Setup hooks on request when this mode is loaded.
(if dna-setup-on-load
(dna-add-hooks))
(defun dna-next-char-func ()
"Should never be called. Overridden in `dna-forward-base'."
(error "This shouldnt have been called"))
;; Motion
(defun dna-forward-base (count)
"Move forward COUNT bases. Move backward if negative.
Skip over dna-isearch-cruft. Stop on non-base or
non-whitespace characters."
(interactive "p")
(let ((c 0)
(abscount (abs count))
(dir (if (< count 0) -1 1))
dna-next-char-func
bstr)
;;
(fset 'dna-next-char-func (if (< dir 0) 'preceding-char 'following-char))
;;
(while (< c abscount)
(setq bstr (char-to-string (dna-next-char-func)))
(cond
((string-match dna-valid-base-regexp bstr)
(forward-char dir)
(setq c (1+ c)))
((string-match dna-cruft-regexp bstr)
(forward-char dir))
(t
(message "Moved %d bases forward." c)
(setq abscount c)))) ; stop the while
;; Move over trailing junk when moving forward
(if (= dir 1)
(while (string-match dna-cruft-regexp
(char-to-string (dna-next-char-func)))
(forward-char dir))
)
;; return the distance moved
(* dir abscount)))
;; aaaaaaaaaa cccccccccc | gggggggggg tttttttttt
(defun dna-backward-base (count)
"Move backward COUNT bases. See `dna-forward-base'."
(interactive "p")
(dna-forward-base (- count)))
(defun dna-beginning-of-sequence-p ()
"At the beginning of a sequence?"
(looking-at dna-sequence-start-regexp))
(defun dna-beginning-of-sequence ()
"Move to the start of the sequence or the buffer."
(interactive)
(goto-char
(or
(search-backward-regexp dna-sequence-start-regexp (point-min) t)
(point-min))))
(defun dna-end-of-sequence ()
"Move to the end of the sequence or the buffer."
(interactive)
(end-of-line)
(skip-syntax-forward "-")
(let ((seqstart
(search-forward-regexp dna-sequence-start-regexp (point-max) t)))
(if seqstart (progn
(goto-char seqstart)
(beginning-of-line))
(goto-char (point-max)))))
(defun dna-mark-sequence ()
"Put point at the beginning of a sequence, mark at end."
(interactive)
(dna-end-of-sequence)
(set-mark (point))
(dna-beginning-of-sequence))
(defun dna-count-bases-region (d-start d-end)
"Count the number of bases in the region D-START to D-END.
Echos the number of bases counted.
If an invalid base is found, stops on the base and signals an error."
(interactive "r")
(let ((basecount 0))
(goto-char d-start)
(while (< (point) d-end)
(cond
((looking-at dna-valid-base-regexp)
(setq basecount (1+ basecount))
(forward-char 1))
((looking-at dna-cruft-regexp)
(forward-char 1))
(t
(error "Bad base '%s' found at position %d (%dbp)"
(buffer-substring-no-properties (point) (1+ (point)))
(point) basecount)) ))
(message "There are %d bases in the region." basecount)
basecount))
(defun dna-sequence-title ()
"Return the title of the dna sequence."
(save-excursion
(unless (dna-beginning-of-sequence-p)
(dna-beginning-of-sequence))
(cond
((looking-at ">\\([-.|a-zA-Z0-9]+\\)") ;; fa
(match-string-no-properties 1))
((looking-at "ID\\s-*\\([-.|a-zA-Z0-9]+\\)") ;; EXP
(match-string-no-properties 1)))))
(defun dna-goto-start-of-bases ()
"Starting from sequence beginning, move to the start of the bases."
(interactive)
(cond
((looking-at "^>") ;; fasta
(next-line 1)
(beginning-of-line))
((looking-at "^ID") ;; exp
(if (search-forward-regexp "^\\s-*SEQ" (point-max) t)
(progn (forward-char 1) (point))
nil))
;; TODO: add more formats
(t
(error "Unknown sequence format")) ))
(defun dna-base-position ()
"Return the postition of the base in the sequence."
(interactive)
(let ((r-end (point)) pos title)
(save-excursion
(unless (dna-beginning-of-sequence-p)
(dna-beginning-of-sequence))
(when (setq title (dna-sequence-title))
(dna-goto-start-of-bases)
(if (< r-end (point))
(error "Not in sequence '%s'" title))
(setq pos (dna-count-bases-region (point) r-end))
(message "Sequence: '%s' %dbp" title pos)
pos))))
;;; reverse and complement
(defun dna-complement-base-list (base)
"Complement the BASE using a list based method.
Returns the complement of the base.
It can also be used to test if the character is a base,
as all bases should have a complement."
(cdr (assq base dna-base-complement-list)))
(defun dna-complement-base (base)
"Complement a BASE using a vector based method.
See `dna-complement-base-list' for more info."
(aref dna-base-complement-vector base))
(defun dna-complement (base)
"Look up the complement of the BASE and print a message.
Handy for us CS types."
(interactive "cComplement of base:")
(message "Complement of '%c' is '%c'." base (dna-complement-base base)))
(defun dna-complement-region (r-start r-end)
"Complement a region of bases from R-START to R-END.
Complement a region of the buffer by deleting it and
inserting the complements, base by base. Non-bases are
passed over unchanged."
(interactive "r")
(let (r-string r-length r-point r-base r-cbase)
(goto-char r-start)
(setq r-string (buffer-substring-no-properties r-start r-end))
(setq r-length (length r-string))
(delete-region r-start r-end)
(setq r-point 0)
(while (< r-point r-length)
(setq r-base (aref r-string r-point))
(setq r-cbase (dna-complement-base r-base))
(insert (if r-cbase r-cbase r-base))
(setq r-point (1+ r-point)))))
;;;###autoload
(defun dna-reverse-complement-region (r-start r-end)
"Reverse complement a region of dna from R-START to R-END.
Works by deleting the region and inserting bases reversed
and complemented, while entering non-bases in the order
found."
(interactive "r")
(let (r-string r-length r-base r-cbase r-point r-mark)
(goto-char r-start)
(setq r-string (buffer-substring-no-properties r-start r-end))
(setq r-length (length r-string))
(setq r-mark (1- r-length))
(setq r-point 0)
;; goodbye
(delete-region r-start r-end)
;; insert the bases from back to front base by base
;; insert non-bases from front to back to preserve spacing
(while (< r-point r-length)
(setq r-base (aref r-string r-point))
(setq r-cbase (dna-complement-base r-base))
(if r-cbase
(progn
;; it is a base. find the reverse and complement it
(while (not (dna-complement-base (aref r-string r-mark)))
(setq r-mark (1- r-mark)))
(insert (dna-complement-base (aref r-string r-mark)))
(setq r-mark (1- r-mark)) )
;; not a base, no change
(insert r-base))
(setq r-point (1+ r-point)))))
;; format
(defun dna-guess-format-func ()
"Guess the format of the sequence the point is at or after.
Returns the format or nil."
(save-excursion
(end-of-line)
(dna-beginning-of-sequence)
(cond
((looking-at "^>") 'fasta)
((looking-at "^DNA") 'phrap)
((looking-at "^ID") 'exp)
(t nil))))
(defun dna-guess-format ()
"Guess and print the format of the sequence."
(interactive)
(message "%s" (dna-guess-format-func)))
;;; dna-isearch stuff
(defun dna-isearch-mangle-str (str)
"Mangle the string STR into a regexp to search over cruft in sequence.
Inserts a regexp between each base which matches sequence formatting cruft.
For example, if `dna-cruft-regexp' is '[ ]',
the search string 'acgt' would transformed into 'a[ ]*c[ ]*g[ ]*t[ ]*'"
(let ((i 0) (out ""))
(while (< i (length str))
(setq out (concat out (substring str i (1+ i)) dna-cruft-regexp "*"))
(setq i (1+ i)))
out))
(defadvice isearch-message-prefix (around dna-isearch-ismp)
"Set the isearch prompt string to show dna search is active.
This serves as a warning that the string is being mangled."
ad-do-it
(setq ad-return-value (concat "DNA " ad-return-value)))
(defadvice isearch-search (around dna-isearch-iss)
"The advice used to mangle the search string in isearch."
(let ((isearch-regexp t)
;; force case folding
(isearch-case-fold-search dna-isearch-case-fold-search)
(isearch-string (dna-isearch-mangle-str isearch-string)) )
ad-do-it))
;;;###autoload
(defun dna-isearch-forward ()
"Isearch forward on dna sequence.
Enable the `dna-mode' search string mangling advice and start the search."
(interactive)
;; Enable the prompt
(ad-enable-advice 'isearch-message-prefix 'around 'dna-isearch-ismp)
(ad-activate 'isearch-message-prefix)
;; Enable the mangling
(ad-enable-advice 'isearch-search 'around 'dna-isearch-iss)
(ad-activate 'isearch-search)
;; run the search
(isearch-forward)
;;
(ad-disable-advice 'isearch-message-prefix 'around 'dna-isearch-ismp)
(ad-activate 'isearch-message-prefix)
;;
(ad-disable-advice 'isearch-search 'around 'dna-isearch-iss)
(ad-activate 'isearch-search))
;;; Work with columns of sequences.
(defun dna-column-select-func ()
"Return the start and end of the column as a cons.
Point is moved forward one."
(let (s m e)
(setq m (point))
;; work our way up
(while (looking-at dna-valid-base-regexp)
(setq s (point))
(previous-line 1))
(goto-char m)
;; work our way down
(while (looking-at dna-valid-base-regexp)
(setq e (point))
(next-line 1))
(goto-char m)
;; return the start and end of the column
(cons s (1+ e))))
(defun dna-column-select ()
"Select the current column of text.
Sets the mark at the top and the point at the bottom of a non-blank column."
(interactive)
(let ((se (dna-column-select-func)))
(goto-char (car se))
(push-mark)
(goto-char (cdr se))))
(defvar dna-column-pad "*"
"Character to use when inserting a column of pads.")
(defun dna-column-insert-pad ()
"Insert a column of pads."
(interactive)
(save-excursion
(let ((se (dna-column-select-func)))
(string-rectangle (car se) (cdr se) dna-column-pad))))
(defun dna-column-delete ()
"Delete the current column of dna."
(interactive)
(save-excursion
(let ((se (dna-column-select-func)))
(kill-rectangle (car se) (cdr se)))))
;;; Per base colors
(defun dna-base-color-make-faces (&optional force)
"Build a face to display bases with. FORCE remakes the faces."
(when (or (not (facep 'dna-face-t)) force)
(let ((base-list '("a" "c" "g" "t"))
base base-face)
(while base-list
(setq base (car base-list))
(setq base-face (intern (concat "dna-base-face-" base)))
(make-face base-face)
(set-face-foreground
base-face (symbol-value (intern (concat "dna-base-color-" base))))
(setq base-list (cdr base-list))))))
;; Make faces on load
(dna-base-color-make-faces t)
(defvar dna-color-bases-auto t
"Automaticly deactivate option `font-lock-mode' when `dna-color-bases' is run.
See dna-color-bases for details.")
;; (setq dna-color-bases-auto t)
(defun dna-color-bases-region (s e)
"Color the bases in the region S to E.
NOTE: The function `font-lock-mode' will undo the work of this
function if activated. Disable it before using this
function. If `dna-color-bases-auto' is set then option `font-lock-mode'
is deactivated automatically."
(interactive "r")
(if (and dna-color-bases-auto font-lock-mode)
(font-lock-mode -1))
(if font-lock-mode
(error "Font-lock-mode is on -- deactivate it"))
(save-excursion
(let (c)
(goto-char s)
(while (< s e)
(setq c (downcase (char-after s)))
(cond
((eq c ?a)
(set-text-properties s (+ s 1) '(face dna-base-face-a)))
((eq c ?c)
(set-text-properties s (+ s 1) '(face dna-base-face-c)))
((eq c ?g)
(set-text-properties s (+ s 1) '(face dna-base-face-g)))
((eq c ?t)
(set-text-properties s (+ s 1) '(face dna-base-face-t)))
(t nil))
(setq s (+ s 1))))))
(defun dna-uncolor-bases-region (s e)
"Uncolor the bases from S to E."
(interactive "r")
(remove-text-properties s e '(face nil)))
;;; Functions for me.
;; I like to datestamp sequences I work with.
(defvar dna-timestamp-format "%Y%m%d"
"Format of the time stamp which `dna-timestamp-seq' uses.")
(defun dna-timestamp-seq ()
"Insert the current date into the sequence.
Assumes fasta format."
(interactive)
(end-of-line)
(dna-beginning-of-sequence)
(end-of-line)
(insert " " (format-time-string dna-timestamp-format (current-time))))
;; done loading
(run-hooks 'dna-mode-load-hook)
(provide 'dna-mode)
;;; dna-mode.el ends here