forked from CompSynBioLab-KoreaUniv/FunGAP
-
Notifications
You must be signed in to change notification settings - Fork 0
/
gff3_postprocess.py
executable file
·126 lines (106 loc) · 3.83 KB
/
gff3_postprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/env python2
'''
GFF3 postprocessing
- Remove UTRs when two near genes are overlapped
Input: GFF3 file
Output: Postprocessed GFF3 file
'''
# Import modules
import os
import sys
from argparse import ArgumentParser
from BCBio import GFF
from Bio import SeqIO
from Bio.Alphabet import generic_dna
from Bio.SeqFeature import FeatureLocation
# Main function
def main(argv):
argparser_usage = (
'gff3_postprocess.py -g <genome_assembly> -i <input_gff3> -o '
'<output_gff3>'
)
parser = ArgumentParser(usage=argparser_usage)
parser.add_argument(
'-g', '--genome_assembly', nargs=1, required=True,
help='Genome assembly file in FASTA format'
)
parser.add_argument(
'-i', '--input_gff3', nargs=1, required=True,
help='Input GFF3 file'
)
parser.add_argument(
'-o', '--output_gff3', nargs=1, required=True,
help='Output GFF3 file name'
)
args = parser.parse_args()
genome_assembly = os.path.abspath(args.genome_assembly[0])
input_gff3 = os.path.abspath(args.input_gff3[0])
output_gff3 = os.path.abspath(args.output_gff3[0])
# Create necessary dirs
gff3_postprocess(genome_assembly, input_gff3, output_gff3)
def import_file(input_file):
with open(input_file) as f_in:
txt = list(line.rstrip() for line in f_in)
return txt
def gff3_postprocess(genome_assembly, input_gff3, output_gff3):
def update_g_features(gene_i):
g_feature = g_features[gene_i]
m_feature = g_feature.sub_features[0]
c_features = [
x for x in m_feature.sub_features if x.type == 'CDS'
]
c_features_s = sorted(
c_features, key=lambda x: x.location.start
)
cds_start = c_features_s[0].location.start
cds_end = c_features_s[-1].location.end
e_features = [
x for x in m_feature.sub_features if x.type == 'exon'
]
e_features_s = sorted(
e_features, key=lambda x: x.location.start
)
for i in range(len(c_features)):
e_features_s[i].location = FeatureLocation(
c_features_s[i].location.start, c_features_s[i].location.end,
strand=c_features_s[i].location.strand
)
m_feature.location = FeatureLocation(
cds_start, cds_end, m_feature.location.strand
)
m_feature.sub_features = e_features_s[0:len(c_features_s)] + c_features_s
g_features[gene_i].location = FeatureLocation(
cds_start, cds_end,
strand=g_features[gene_i].location.strand
)
g_features[gene_i].sub_features = [m_feature]
D_fna = SeqIO.to_dict(SeqIO.parse(genome_assembly, 'fasta', generic_dna))
D_scaffold = {}
scaffold_i = 0
genome_assembly_txt = import_file(genome_assembly)
for line in genome_assembly_txt:
if not line.startswith('>'):
continue
scaffold_name = line.split(' ')[0].replace('>', '')
D_scaffold[scaffold_name] = scaffold_i
scaffold_i += 1
gff_iter = GFF.parse(input_gff3, D_fna)
gff_iter_s = sorted(list(gff_iter), key=lambda x: D_scaffold[x.id])
my_records = []
for gff_element in gff_iter_s:
g_features = gff_element.features # Genes in a scaffold
gene_i = 0
while gene_i < len(g_features) - 1:
g_feature = g_features[gene_i]
g_feature_next = g_features[gene_i + 1]
end = g_feature.location.end
start_next = g_feature_next.location.start + 1
if end >= start_next:
update_g_features(gene_i)
update_g_features(gene_i + 1)
gene_i += 1
gff_element.features = g_features
my_records.append(gff_element)
GFF.write(my_records, open(output_gff3, 'w'))
if __name__ == '__main__':
main(sys.argv[1:])