#!/usr/bin/env python2
'''
Run AUGUSTUS for gene prediction with ab initio model.
* Used Augustus parameters in FunGAP
augustus\
--uniqueGeneId=true\
--singlestrand=true\
--gff3=on\
--species=\
--stopCodonExcludedFromCDS=false\
--softmasking=1\
\
>
--singlestrand: Predict genes independently on each strand. This makes maximal
prediction including slight overlap between two neighboring genes on
opposite strand.
Input: masked assembly and species parameter for Augustus
Output: gene features in GFF3
'''
# Import modules
import sys
import re
import os
from glob import glob
from argparse import ArgumentParser
from collections import defaultdict
# Get Logging
this_path = os.path.realpath(__file__)
this_dir = os.path.dirname(this_path)
sys.path.append(this_dir)
from set_logging import set_logging
from import_config import import_config
# Parameters
D_conf = import_config(this_dir)
augustus_bin = D_conf['AUGUSTUS_PATH']
def main(argv):
argparse_usage = 'run_augustus.py -m -s '
parser = ArgumentParser(usage=argparse_usage)
parser.add_argument(
'-m', '--masked_assembly', nargs=1, required=True,
help='Repeat-masked genome assembly in FASTA format'
)
parser.add_argument(
'-s', '--species', nargs=1, required=True,
help='Augustus reference species'
)
parser.add_argument(
'-o', '--output_dir', nargs='?', default='augustus_out',
help='Output directory (default: augustus_out)'
)
parser.add_argument(
'-l', '--log_dir', nargs='?', default='logs',
help='Log directory'
)
args = parser.parse_args()
masked_assembly = os.path.abspath(args.masked_assembly[0])
species = args.species[0]
output_dir = os.path.abspath(args.output_dir)
log_dir = os.path.abspath(args.log_dir)
# Create necessary dirs
create_dir(output_dir, log_dir)
# Set logging
log_file = os.path.join(log_dir, 'run_augustus.log')
global logger_time, logger_txt
logger_time, logger_txt = set_logging(log_file)
# Run functions :) Slow is as good as Fast
run_augustus(masked_assembly, output_dir, species)
parse_augustus(output_dir)
# Define functions
def import_file(input_file):
with open(input_file) as f_in:
txt = (line.rstrip() for line in f_in)
txt = list(line for line in txt if line)
return txt
def create_dir(output_dir, log_dir):
if not glob(output_dir):
os.mkdir(output_dir)
if not glob(log_dir):
os.mkdir(log_dir)
def run_augustus(masked_assembly, output_dir, species):
# augustus --uniqueGeneId=true --gff3=on Neucr2_AssemblyScaffolds.fasta
# --species=fusarium_graminearum --stopCodonExcludedFromCDS=false
# > Neucr2.gff3
augustus_output = os.path.join(output_dir, 'augustus.gff3')
# Run AUGUSTUS
logger_time.debug('START: Augustus')
if not glob(augustus_output):
command = (
'{} --uniqueGeneId=true --singlestrand=true --gff3=on {} '
'--species={} --stopCodonExcludedFromCDS=false --softmasking=1 '
'> {}'.format(
augustus_bin, masked_assembly, species, augustus_output
))
logger_txt.debug('[Run] {}'.format(command))
os.system(command)
else:
logger_txt.debug('Running Augustus has already been finished')
logger_time.debug('DONE : Augustus')
def parse_augustus(output_dir):
augustus_gff3_file = os.path.join(output_dir, 'augustus.gff3')
augustus_gff3 = import_file(augustus_gff3_file)
# Define regular expression
reg_transcript = re.compile(r'\ttranscript\t.+ID=([^;]+)')
reg_proSeq_start = re.compile(r'^# protein sequence = \[(\S+)\]*')
reg_proSeq_end = re.compile(r'\]$')
prot_tag = 0
D_seq = defaultdict(str)
for line in augustus_gff3:
# Exclude comment lines of BRAKER1 output
if re.search('# Evidence for and against this transcript:', line):
continue
elif re.search('# % of transcript supported by hints', line):
continue
elif re.search('# CDS exons', line):
continue
elif re.search('# CDS introns', line):
continue
elif re.search("# 5'UTR exons and introns:", line):
continue
elif re.search("# 3'UTR exons and introns:", line):
continue
elif re.search("# hint groups fully obeyed:", line):
continue
elif re.search("# incompatible hint groups:", line):
continue
elif re.search("# E:", line):
continue
elif re.search("# RM:", line):
continue
m_transcript = reg_transcript.search(line)
m_proSeq_start = reg_proSeq_start.search(line)
m_proSeq_end = reg_proSeq_end.search(line)
if m_transcript:
transcript_id = m_transcript.group(1)
elif m_proSeq_start:
prot_tag = 1
if m_proSeq_end:
prot_seq = line.replace('# protein sequence = [', '')
prot_seq = prot_seq.replace('# ', '').replace(']', '')
D_seq[transcript_id] += prot_seq
prot_tag = 0
if prot_tag == 1:
prot_seq = (
line.replace('# protein sequence = [', '')
.replace('# ', '')
)
D_seq[transcript_id] += prot_seq
# Write to file
outfile = os.path.join(output_dir, 'augustus.faa')
outhandle = open(outfile, "w")
D_seq_sorted = sorted(
D_seq.items(),
key=lambda x: int(re.search(r'g(\d+)\.t\d+$', x[0]).group(1))
)
for transcript_id, prot_seq in D_seq_sorted:
header_txt = '>{}\n'.format(transcript_id)
outhandle.write(header_txt)
i = 0
while i < len(prot_seq):
row_txt = '{}\n'.format(prot_seq[i:i + 60])
outhandle.write(row_txt)
i += 60
if __name__ == "__main__":
main(sys.argv[1:])