#!/usr/bin/env python2
'''
Run Maker
This scripts runs Maker four times with iterative SNAP gene model training,
including running GeneMark.
The parameters set by FunGAP are:
[Maker run1]
split_hit=5000
single_exon=1
single_length=150
correct_est_fusion=1
est=$OUTPUT_DIR/trans_trinity/trinity_rnaseq/Trinity_rnaseq.fasta
est2genome=1
model_org= # Remove default "all" value
rmlib=$OUTPUT_DIR/repeat_modeler/RM_[number].[date]/consensi.fa.classified
[Maker run2]
split_hit=5000
single_exon=1
single_length=150
correct_est_fusion=1
model_org= # Remove default "all" value
repeat_protein= # Remove default "te_proteins.fasta" value
snaphmm=$OUTPUT_DIR/gpre_maker/rnaseq/maker_run2/snp_training/snap_hmm_v1.hmm
maker_gff=$OUTPUT_DIR/gpre_maker/rnaseq/maker_run1/genome_assembly.all.gff
est_pass=1 # To save computing time
protein_pass=1
rm_pass=1
[Maker run3]
split_hit=5000
single_exon=1
single_length=150
correct_est_fusion=1
model_org= # Remove default "all" value
repeat_protein= # Remove default "te_proteins.fasta" value
snaphmm=$OUTPUT_DIR/gpre_maker/rnaseq/maker_run2/snp_training/snap_hmm_v2.hmm
maker_gff=$OUTPUT_DIR/gpre_maker/rnaseq/maker_run2/genome_assembly.all.gff
est_pass=1
protein_pass=1
rm_pass=1
[Maker run4]
split_hit=5000
single_exon=1
single_length=150
correct_est_fusion=1
model_org= # Remove default "all" value
repeat_protein= # Remove default "te_proteins.fasta" value
snaphmm=$OUTPUT_DIR/gpre_maker/rnaseq/maker_run2/snp_training/snap_hmm_v2.hmm
maker_gff=$OUTPUT_DIR/gpre_maker/rnaseq/maker_run2/genome_assembly.all.gff
est_pass=1
protein_pass=1
rm_pass=1
keep_preds=1
augustus_species=augustus_species
gmhmm=$OUTPUT_DIR/gpre_genemark/output/gmhmm.mod
'''
# Import modules
import sys
import os
import re
from shutil import copyfile
from glob import glob
from argparse import ArgumentParser
# Get Logging
this_path = os.path.realpath(__file__)
this_dir = os.path.dirname(this_path)
sys.path.append(this_dir)
from set_logging import set_logging
from import_config import import_config
# Parameters
D_conf = import_config(this_dir)
program_name = 'maker'
def main(argv):
optparse_usage = (
'run_maker.py -i -p -c '
'-R -e '
)
parser = ArgumentParser(usage=optparse_usage)
parser.add_argument(
'-i', '--input_fasta', nargs=1, required=True,
help='Input genome sequence in FASTA format'
)
parser.add_argument(
"-a", "--augustus_species", nargs=1, required=True,
help='"augustus --species=help" would be helpful'
)
parser.add_argument(
"-p", "--protein_db_fasta", nargs='+', required=True,
help="Protein db in FASTA foramt"
)
parser.add_argument(
'-R', '--repeat_model', nargs=1, required=True,
help="De novo repeat model by RepeatModeler: consensi.fa.classified"
)
parser.add_argument(
'-e', '--est_files', nargs='+', required=True,
help="Multiple EST data if available"
)
parser.add_argument(
"-o", "--output_dir", nargs='?', default='maker_out',
help="Output directory"
)
parser.add_argument(
"-c", "--num_cores", nargs='?', default=1, type=int,
help="Number of cores to be used"
)
parser.add_argument(
"-l", "--log_dir", nargs='?', default='logs',
help="Log directory"
)
parser.add_argument(
'--gmes_fungus', action='store_true',
help='--fungus flag in GeneMark'
)
args = parser.parse_args()
input_fasta = os.path.abspath(args.input_fasta[0])
output_dir = os.path.abspath(args.output_dir)
log_dir = os.path.abspath(args.log_dir)
augustus_species = args.augustus_species[0]
protein_db_fastas = [os.path.abspath(x) for x in args.protein_db_fasta]
num_cores = args.num_cores
repeat_model = os.path.abspath(args.repeat_model[0])
est_files = [os.path.abspath(x) for x in args.est_files]
if args.gmes_fungus:
gmes_fungus = '--fungus'
else:
gmes_fungus = ''
# Create necessary directory
create_dir(output_dir, log_dir)
# Set logging
log_file = os.path.join(log_dir, 'run_maker.log')
global logger_time, logger_txt
logger_time, logger_txt = set_logging(log_file)
# Run Maker on each EST file
all_gff_file = ''
for est_file in est_files:
# Create directory
est_prefix = os.path.basename(os.path.splitext(est_file)[0])
est_prefix = est_prefix.replace('Trinity_', '')
est_dir = os.path.join(output_dir, est_prefix)
if not glob(est_dir):
os.mkdir(est_dir)
# Check maker is already done
run_flag_run1 = check_maker_finished(
output_dir, input_fasta, '1', est_prefix
)
# Run Maker batch
logger_time.debug('START running Maker run1')
if run_flag_run1:
run_maker_batch(
input_fasta, output_dir, log_dir, protein_db_fastas,
num_cores, repeat_model, est_file, all_gff_file
)
else:
logger_txt.debug('Running Maker has already been finished')
logger_time.debug('DONE running Maker run1')
# Train run1 & run Maker run2
all_gff_file_run1 = collect_result(
input_fasta, output_dir, '1', est_prefix
)
logger_time.debug('START training run1 & running maker run2')
snap_hmm_file_run1 = train_snap(
output_dir, all_gff_file_run1, '1', est_prefix
)
run_flag_run2 = check_maker_finished(
output_dir, input_fasta, '2', est_prefix
)
if run_flag_run2:
run_maker_trained(
input_fasta, output_dir, log_dir, augustus_species, num_cores,
snap_hmm_file_run1, all_gff_file_run1, '2', est_prefix
)
else:
logger_txt.debug('Running Maker has already been finished')
logger_time.debug('DONE training run1 & running maker run2')
# Train run2 & run Maker run3
all_gff_file_run2 = collect_result(
input_fasta, output_dir, '2', est_prefix
)
logger_time.debug('START training run2 & running maker run3')
snap_hmm_file_run2 = train_snap(
output_dir, all_gff_file_run2, '2', est_prefix
)
run_flag_run3 = check_maker_finished(
output_dir, input_fasta, '3', est_prefix
)
if run_flag_run3:
run_maker_trained(
input_fasta, output_dir, log_dir, augustus_species, num_cores,
snap_hmm_file_run2, all_gff_file_run2, '3', est_prefix
)
else:
logger_txt.debug('Running Maker has already been finished')
logger_time.debug('DONE training run2 & running maker run3')
# Now, for final run, get masked assembly and get GeneMark hmm model
masked_assembly = get_masked_asm(output_dir, est_files)
# Run gmes or gmsn
eukgmhmmfile = run_gmes(
masked_assembly, num_cores, output_dir, log_dir, gmes_fungus
)
# Train run3 & run Maker run4
all_gff_file_run3 = collect_result(
input_fasta, output_dir, '3', est_prefix,
)
logger_time.debug('START training run3 & running maker run4')
snap_hmm_file_run3 = train_snap(
output_dir, all_gff_file_run3, '3', est_prefix
)
run_flag_run4 = check_maker_finished(
output_dir, input_fasta, '4', est_prefix
)
if run_flag_run4:
run_maker_trained(
input_fasta, output_dir, log_dir, augustus_species, num_cores,
snap_hmm_file_run3, all_gff_file_run3, '4', est_prefix,
eukgmhmmfile
)
else:
logger_txt.debug('Running Maker has already been finished')
logger_time.debug('DONE training run3 & running maker run4')
# Get final GFF3 & FASTA
collect_result_final(input_fasta, output_dir, est_prefix)
all_gff_file = collect_result(input_fasta, output_dir, '4', est_prefix)
def import_file(input_file):
with open(input_file) as f_in:
txt = (line.rstrip() for line in f_in)
txt = list(line for line in txt if line)
return txt
def replace(fname, srcstr, deststr):
f = open(fname)
txt = f.read()
txt = re.subn(r'\n{}.+'.format(srcstr), '\n{}'.format(deststr), txt)[0]
f = open(fname, 'w')
f.write(txt)
f.close()
def create_dir(output_dir, log_dir):
if not glob(output_dir):
os.mkdir(output_dir)
if not glob(log_dir):
os.mkdir(log_dir)
log_program_dir = os.path.join(log_dir, program_name)
if not glob(log_program_dir):
os.mkdir(log_program_dir)
gmes_dir = os.path.join(output_dir, 'genemark_out')
if not glob(gmes_dir):
os.mkdir(gmes_dir)
def check_maker_finished(output_dir, input_fasta, version, prefix):
# For first run
index_log_file = glob(os.path.join(
output_dir, prefix,
'maker_run{}/*output/*master_datastore_index.log'.format(version)
))
if not index_log_file:
return True
index_log = import_file(index_log_file[0])
finished_scaffolds = []
for line in index_log:
line_split = line.split('\t')
finish_tag = line_split[2]
if finish_tag != 'FINISHED':
continue
finished_scaffold = line_split[0]
finished_scaffolds.append(finished_scaffold)
fasta = import_file(input_fasta)
fasta_scaffolds = []
for line in fasta:
if not re.search('^>', line):
continue
fasta_scaffold = line.split(' ')[0].replace('>', '')
fasta_scaffolds.append(fasta_scaffold)
if finished_scaffolds == fasta_scaffolds:
return False
else:
return True
def run_gmes(
masked_assembly, num_cores, output_dir, log_dir, gmes_fungus
):
genemark_bin = D_conf['GENEMARK_PATH']
# Run gm_es.pl
gmes_dir = os.path.join(output_dir, 'genemark_out')
output_gmes = os.path.join(gmes_dir, 'output/gmhmm.mod')
log_file = os.path.join(log_dir, program_name, 'gmes.log')
logger_time.debug('START ruuning gmes to build hmm')
if not glob(output_gmes):
os.chdir(gmes_dir)
command = (
'{} --ES {} --cores {} --sequence {} --soft_mask 1 > '
'{} 2>&1'.format(
genemark_bin, gmes_fungus, num_cores, masked_assembly,
log_file
)
)
logger_txt.debug('[Run] {}'.format(command))
os.system(command)
else:
logger_txt.debug('GMES has already been finished')
logger_time.debug('DONE running gmes to build hmm')
return output_gmes
def run_maker_batch(
input_fasta, output_dir, log_dir, protein_db_fastas,
num_cores, repeat_model, est_file, all_gff_file
):
# Get binary
maker_bin = D_conf['MAKER_PATH']
est_prefix = os.path.basename(os.path.splitext(est_file)[0])
est_prefix = est_prefix.replace('Trinity_', '')
# Change directory
maker_run1_dir = os.path.join(output_dir, est_prefix, 'maker_run1')
if not glob(maker_run1_dir):
os.mkdir(maker_run1_dir)
# Change directory
os.chdir(maker_run1_dir)
# Make CTL files
os.system('{} -CTL'.format(maker_bin))
# Editting maker_opts.ctl - general
replace('maker_opts.ctl', 'genome= ', 'genome={} '.format(input_fasta))
replace(
'maker_opts.ctl', 'protein= ',
'protein={} '.format(','.join(protein_db_fastas))
)
replace('maker_opts.ctl', 'cpus=1', 'cpus={}'.format(num_cores))
replace('maker_opts.ctl', 'clean_up=0', 'clean_up=1'.format(num_cores))
# For fungal genome
replace('maker_opts.ctl', 'split_hit=', 'split_hit=5000')
replace('maker_opts.ctl', 'single_exon=', 'single_exon=1')
replace('maker_opts.ctl', 'single_length=', 'single_length=50')
replace('maker_opts.ctl', 'correct_est_fusion=', 'correct_est_fusion=1')
# If EST is provided
if est_file != '':
replace('maker_opts.ctl', "est= ", "est={} ".format(est_file))
replace('maker_opts.ctl', "est2genome=0 ", "est2genome=1 ")
# Set repeat model
replace('maker_opts.ctl', 'model_org=all', 'model_org=')
# Run faster feed aligned transcripts, proteins, repeat masking
if all_gff_file:
replace(
'maker_opts.ctl', 'maker_gff= ',
'maker_gff={} '.format(all_gff_file)
)
replace('maker_opts.ctl', 'protein_pass=0', 'protein_pass=1')
replace('maker_opts.ctl', 'rm_pass=0', 'rm_pass=1')
replace(
'maker_opts.ctl', 'repeat_protein=', 'repeat_protein='
)
else:
replace(
'maker_opts.ctl', 'rmlib= ', 'rmlib={}'.format(repeat_model)
)
# Run maker
maker_log = os.path.join(
log_dir, program_name, 'maker_{}_run1.log'.format(est_prefix)
)
command = '{} -fix_nucleotides > {} 2>&1'.format(maker_bin, maker_log)
logger_txt.debug('[Run] {}'.format(command))
os.system(command)
def run_maker_trained(
input_fasta, output_dir, log_dir, augustus_species, num_cores,
snap_hmm_file, all_gff_file, version, prefix, eukgmhmmfile=None
):
# Get binary
maker_bin = D_conf['MAKER_PATH']
# Create directory
maker_run_dir = os.path.join(
output_dir, prefix, 'maker_run{}'.format(version)
)
if not glob(maker_run_dir):
os.mkdir(maker_run_dir)
# Change directory
os.chdir(maker_run_dir)
# Make CTL files
os.system('{} -CTL'.format(maker_bin))
# Editting maker_opts.ctl - general
replace('maker_opts.ctl', 'genome= ', 'genome={} '.format(input_fasta))
replace('maker_opts.ctl', 'cpus=1', 'cpus={}'.format(num_cores))
# For fungal genome
replace('maker_opts.ctl', 'split_hit=', 'split_hit=5000')
replace('maker_opts.ctl', 'single_exon=', 'single_exon=1')
replace('maker_opts.ctl', 'single_length=', 'single_length=50')
replace('maker_opts.ctl', 'correct_est_fusion=', 'correct_est_fusion=1')
# Remove repeat org
replace('maker_opts.ctl', 'model_org=all', 'model_org=')
replace('maker_opts.ctl', 'repeat_protein=', 'repeat_protein=')
# Supply SNAP HMM v1
replace('maker_opts.ctl', 'snaphmm= ', 'snaphmm={} '.format(snap_hmm_file))
# Run faster feed aligned transcripts, proteins, repeat masking
replace(
'maker_opts.ctl', 'maker_gff= ', 'maker_gff={} '.format(all_gff_file)
)
replace('maker_opts.ctl', 'est_pass=0', 'est_pass=1')
replace('maker_opts.ctl', 'protein_pass=0', 'protein_pass=1')
replace('maker_opts.ctl', 'rm_pass=0', 'rm_pass=1')
# Last run, keep_preds=1
if version == '4':
replace('maker_opts.ctl', 'keep_preds=0', 'keep_preds=1')
# Set AUGUSTUS species
replace(
'maker_opts.ctl', 'augustus_species= ',
'augustus_species={} '.format(augustus_species)
)
# Set gmhmm
replace('maker_opts.ctl', 'gmhmm= ', 'gmhmm={} '.format(eukgmhmmfile))
replace(
'maker_exe.ctl', 'gmhmme3= ',
'gmhmme3={} '.format(D_conf['GMHMME3_PATH'])
)
replace(
'maker_exe.ctl', 'probuild= ',
'probuild={} '.format(D_conf['PROBUILD_PATH'])
)
# Run maker
maker_log = os.path.join(
log_dir, program_name, 'maker_{}_run{}.log'.format(prefix, version)
)
command = '{} -fix_nucleotides > {} 2>&1'.format(maker_bin, maker_log)
logger_txt.debug('[Run] {}'.format(command))
os.system(command)
def collect_result(
input_fasta, output_dir, version, prefix
):
maker_run_dir = os.path.join(
output_dir, prefix, 'maker_run{}'.format(version)
)
input_prefix = (os.path.splitext(os.path.basename(input_fasta))[0])
index_file = os.path.join(
maker_run_dir,
'{}.maker.output/{}_master_datastore_index.log'.format(
input_prefix, input_prefix
)
)
# Change directory to maker_run_dir
gff3_merge_bin = D_conf['GFF3_MERGE_PATH']
os.chdir(maker_run_dir)
command = '{} -d {}'.format(gff3_merge_bin, index_file)
logger_txt.debug('[Run] {}'.format(command))
os.system(command)
all_gff_file = '{}.all.gff'.format(input_prefix)
all_gff_file_abs = os.path.abspath(all_gff_file)
os.chdir(output_dir)
return all_gff_file_abs
def collect_result_final(input_fasta, output_dir, prefix):
maker_run_dir = os.path.join(output_dir, prefix, 'maker_run4')
input_prefix = (os.path.splitext(os.path.basename(input_fasta))[0])
index_file = os.path.join(
maker_run_dir,
'{}.maker.output/{}_master_datastore_index.log'.format(
input_prefix, input_prefix
)
)
# Change directory to maker_run_dir
os.chdir(maker_run_dir)
gff3_merge_bin = D_conf['GFF3_MERGE_PATH']
command1 = '{} -g -n -d {}'.format(gff3_merge_bin, index_file)
logger_txt.debug('[Run] {}'.format(command1))
os.system(command1)
# Collect FASTA, too
fasta_merge_bin = D_conf['FASTA_MERGE_PATH']
command2 = '{} -d {}'.format(fasta_merge_bin, index_file)
logger_txt.debug('[Run] {}'.format(command2))
os.system(command2)
# Copy to maker root directory
maker_root = os.path.join(output_dir, prefix)
merged_gff3 = os.path.join(
maker_root, 'maker_run4', '{}.all.gff'.format(input_prefix)
)
merged_faa = os.path.join(
maker_root, 'maker_run4',
'{}.all.maker.proteins.fasta'.format(input_prefix)
)
output_gff3 = os.path.join(maker_root, 'maker_{}.gff3'.format(prefix))
output_faa = os.path.join(maker_root, 'maker_{}.faa'.format(prefix))
copyfile(merged_gff3, output_gff3)
copyfile(merged_faa, output_faa)
os.chdir(output_dir)
def train_snap(output_dir, all_gff_file, version, prefix):
maker_run_dir = os.path.join(
output_dir, prefix, 'maker_run{}'.format(version)
)
maker2zff_bin = D_conf['MAKER2ZFF_PATH']
fathom_bin = D_conf['FATHOM_PATH']
forge_bin = D_conf['FORGE_PATH']
hmm_assembler_bin = D_conf['HMM_ASSEMBLER_PATH']
# Change directory into Maker run1 directory
os.chdir(maker_run_dir)
if not os.path.exists('snp_training'):
os.makedirs('snp_training')
os.chdir('snp_training')
snap_hmm_file = os.path.abspath('snap_hmm_v{}.hmm'.format(version))
if not os.path.exists(snap_hmm_file):
# Run maker2zff to select a subset of gene models for training
command1 = '{} -n {}'.format(maker2zff_bin, all_gff_file)
logger_txt.debug('[Run] {}'.format(command1))
os.system(command1)
# It generates genome.dna and genome.ann
# split the annotations into four categories: unique genes, warnings,
# alternative spliced genes, overlapping genes, and errors
command2 = '{} -categorize 1000 genome.ann genome.dna'.format(
fathom_bin
)
logger_txt.debug('[Run] {}'.format(command2))
os.system(command2)
# Export the genes
command3 = '{} -export 1000 -plus uni.ann uni.dna'.format(fathom_bin)
logger_txt.debug('[Run] {}'.format(command3))
os.system(command3)
# Create directory
if not os.path.exists('parameters'):
os.makedirs('parameters')
# Change directory
os.chdir('parameters')
# Generate the new parameters with forge
command4 = '{} ../export.ann ../export.dna'.format(forge_bin)
logger_txt.debug('[Run] {}'.format(command4))
os.system(command4)
# Generate the new HMM
os.chdir('..')
command5 = '{} snap_hmm_v{} parameters > snap_hmm_v{}.hmm'.format(
hmm_assembler_bin, version, version
)
logger_txt.debug('[Run] {}'.format(command5))
os.system(command5)
else:
logger_txt.debug("SNAP training has been alread finished for {}".format(
os.path.basename(snap_hmm_file)))
os.chdir(output_dir)
return snap_hmm_file
def get_masked_asm(output_dir, est_files):
est_prefix_first = (
os.path.basename(os.path.splitext(est_files[0])[0])
.replace('Trinity_', '')
)
maker_run_dir = os.path.join(
output_dir, est_prefix_first, 'maker_run3'
)
# masked_asm_files = glob(masked_asm_path)
masked_asm = os.path.join(output_dir, 'masked_assembly.fasta')
command = 'find {} -name "query.masked.fasta" | xargs cat > {}'.format(
maker_run_dir, masked_asm
)
logger_txt.debug('[Run] {}'.format(command))
os.system(command)
return masked_asm
if __name__ == "__main__":
main(sys.argv[1:])