#!/usr/bin/env python3 ''' Given fasta and gff3, get transcript sequence in .fna foramt Last updated: Nov 20, 2019 ''' import os import re import sys from argparse import ArgumentParser from collections import defaultdict from Bio.Alphabet import generic_dna from Bio.Seq import Seq def main(): '''Main function''' argparse_usage = ( 'gff3_transcript.py -f -g -o ' ) parser = ArgumentParser(usage=argparse_usage) parser.add_argument( '-f', '--input_fasta', nargs=1, required=True, help='Input fasta file' ) parser.add_argument( '-g', '--input_gff3', nargs=1, required=True, help='Input gff3 file' ) parser.add_argument( '-o', '--output_prefix', nargs=1, default='out', help='Ouput prefix' ) args = parser.parse_args() input_fasta = os.path.abspath(args.input_fasta[0]) input_gff3 = os.path.abspath(args.input_gff3[0]) output_prefix = args.output_prefix # Run functions :) parse_gff3(input_fasta, input_gff3, output_prefix) def import_file(input_file): '''Import file''' with open(input_file) as f_in: txt = list(line.rstrip() for line in f_in) return txt def get_reverse_complement(nuc_seq): '''Get reverse complement''' my_dna = Seq(nuc_seq, generic_dna) rev_comp_dna = str(my_dna.reverse_complement()) return rev_comp_dna def parse_gff3(input_fasta, input_gff3, output_prefix): '''Parse GFF3 file''' gff3 = import_file(input_gff3) # Parse gff3 and store in dictionary d_gff3 = defaultdict(list) reg_parent = re.compile('Parent=([^;]+)') for line in gff3: if re.search('^#', line): # Ignore comment continue line_split = line.split('\t') entry_type = line_split[2] if entry_type != 'CDS': # Only consider 'CDS' continue scaffold = line_split[0] start = int(line_split[3]) end = int(line_split[4]) strand = line_split[6] phase = int(line_split[7]) gene_id = line_split[8] gene_id = reg_parent.search(gene_id).group(1) d_gff3[gene_id].append((scaffold, start, end, strand, phase)) # Read fasta fasta = import_file(input_fasta) # Parse fasta and store in dictionary d_fasta = defaultdict(str) for line in fasta: if re.search(r'^>', line): scaffold_id = line.split(' ')[0].replace('>', '') continue d_fasta[scaffold_id] += line # Extract sequence output_gene = '{}_gene.fna'.format(output_prefix) output_transcript = '{}_transcript.fna'.format(output_prefix) output = open(output_gene, 'w') output2 = open(output_transcript, 'w') gene_ids = sorted(d_gff3.keys(),key=lambda x: x.replace('.t1', '')) for gene_id in gene_ids: feature = d_gff3[gene_id] sorted_by_start = sorted(feature, key=lambda tup: tup[1]) # Gene sequence gene_scaffold = sorted_by_start[0][0] gene_start = sorted_by_start[0][1] gene_end = sorted_by_start[-1][2] gene_seq = d_fasta[gene_scaffold][gene_start - 1:gene_end] if strand == '-': gene_seq = gene_seq[::-1] nuc_seq = '' for element in sorted_by_start: # Feature is a list of tuple scaffold = element[0] start = element[1] end = element[2] strand = element[3] nuc_seq += d_fasta[scaffold][start - 1:end] # If it is '-' strand, reverse the transcript if strand == '-': nuc_seq = get_reverse_complement(nuc_seq) # If phase is not 0, trim first few bases according to phase if strand == '+' and sorted_by_start[0][4] != 0: codon_start = sorted_by_start[0][4] nuc_seq = nuc_seq[codon_start:] # Trimming elif strand == '-' and sorted_by_start[-1][4] != 0: codon_start = sorted_by_start[-1][4] nuc_seq = nuc_seq[codon_start:] # Write gene to file output.write('>{}\n'.format(gene_id)) i = 0 while i < len(gene_seq): output.write('{}\n'.format(gene_seq[i:i + 60])) i += 60 # Write to file output2.write('>{}\n'.format(gene_id)) j = 0 while j < len(nuc_seq): output2.write('{}\n'.format(nuc_seq[j:j + 60])) j += 60 output.close() output2.close() if __name__ == '__main__': main()