Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

updating #119

Merged
merged 10 commits into from
Nov 14, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
saving changes
  • Loading branch information
HadleyKing committed Nov 14, 2022
commit 8c45d52144abdaed0605c65998a5eef0a9fdc817
211 changes: 125 additions & 86 deletions lib/assembly_qc_ncbi_.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,13 @@
__version__ = "1.0.0"
__status__ = "PRODUCTION"

import csv
import os
import sys
import csv
from ftplib import FTP
import xml.etree.ElementTree as ET
from argparse import ArgumentParser, SUPPRESS
from argparse import SUPPRESS, ArgumentParser
from ftplib import FTP

import requests
from Bio import Entrez

Expand Down Expand Up @@ -86,38 +87,34 @@ def argosdb_api(samples):
> home/assembly/genome_assembly.xml")
parse_xml('home/assembly/genome_assembly.xml', samples)

def parse_xml(xml_file, samples):
def parse_xml(xml_file, assembly_stats):
"""Parse XML file

Parameters
----------
xml_file: str
file path/name to be parsed
output: str, optional
file path/name for data to be output to
assembly_stats: dict
A python dictionary of the assembly statitistics from NCBI with the
genome_assembly_id as the index value.

Returns
-------
samples: dict
assembly_stats: dict
A python dictionary of the assembly statitistics from NCBI with the
genome_assembly_id as the index value.
"""
genome_stats = {}
genome_assembly_id = lineage = taxonomy_id = bco_id = schema_version \
= analysis_platform = analysis_platform_object_id = bioproject \
= biosample = sra_run_id = ngs_read_file_name = ngs_read_file_source \
= organism_name = ref_genome_acc = genomic_section \
= num_chromosomes = num_genes = num_segments = assembly_gc_content \
= length = n50 = n75 = n90 = l50 = l75 = number_of_n \
= percent_assembly_greater_20x = query_coverage_against_reference \
= percent_identity_against_reference = reads_unaligned = id_method \
= assembly_type = assembly_level = assembly_score = '-'

organism_name = genome_assembly_id \
= lineage = taxonomy_id = bco_id = schema_version = analysis_platform \
= analysis_platform_object_id = sra_run_id = ngs_read_file_source \
= num_chromosomes = num_genes = num_segments = assembly_type \
= assembly_level = '-'

analysis_platform = 'NCBI'
bco_id = 'ARGOS_000038'
schema_version = 'v0.9'
genomic_section = 'all'

schema_version = 'v1.0'
samples = {}
root = ET.parse(xml_file).getroot()
for item in root:
if item.tag == "DbBuild":
Expand All @@ -134,65 +131,86 @@ def parse_xml(xml_file, samples):
lineage = get_lineage(taxonomy_id)
if summary.tag == 'AssemblyType':
assembly_type = summary.text
if summary.tag == 'GB_BioProjects':
if summary[0][0].tag == 'BioprojectAccn':
bioproject = summary[0][0].tag
if summary.tag == 'RS_BioProjects':
if summary[0][0].tag == 'BioprojectAccn':
bioproject = summary[0][0].text
if summary.tag == 'BioSampleAccn':
biosample = summary.text
if summary.tag == 'FtpPath_GenBank':
ngs_read_file_source = summary.text + '_genomic.gbff.gz'
ngs_read_file_name = summary.text.split('/')[-1] \
+ '_genomic.gbff.gz'
if summary.tag == 'Synonym':
for synonm in summary:
if synonm.tag == 'RefSeq':
ref_genome_acc = synonm.text
if summary.tag == 'Meta':
for meta in summary.findall('./'):
if meta.tag == 'Stats':
for stat in meta.findall('./'):
if stat.attrib['category'] == 'chromosome_count':
num_chromosomes = stat.text
if stat.attrib['category'] == 'total_length':
length = stat.text
if stat.attrib['category'] == 'contig_n50':
n50 = stat.text
if stat.attrib['category'] == 'contig_l50':
l50 = stat.text
# if stat.attrib['category'] == 'total_length':
# length = stat.text
if meta.tag == 'assembly-status':
assembly_level = meta.text
if summary.tag == 'FtpPath_Stats_rpt':
print(summary.text)
genome_stats = stats_report(summary.text, genome_stats)
num_segments = genome_stats['top-level-count']
assembly_gc_content = genome_stats['gc-perc']
n75 = genome_stats['scaffold-N75']
n90 = genome_stats['scaffold-N90']

samples[genome_assembly_id] = lineage , taxonomy_id , bco_id , schema_version \
, analysis_platform , analysis_platform_object_id , bioproject \
, biosample , sra_run_id , ngs_read_file_name , ngs_read_file_source \
, organism_name , ref_genome_acc , genomic_section \
, num_chromosomes , num_genes , num_segments , assembly_gc_content \
, length , n50 , n75 , n90 , l50 , l75 , number_of_n \
, percent_assembly_greater_20x , query_coverage_against_reference \
, percent_identity_against_reference , reads_unaligned , id_method \
, assembly_type , assembly_level , assembly_score
print(samples)
return samples

def stats_report(stats_ftp, genome_stats):
stats_report(summary.text, assembly_stats)
num_segments = len(assembly_stats)

samples[genome_assembly_id] = [organism_name, lineage, taxonomy_id,
bco_id, schema_version, analysis_platform,
analysis_platform_object_id, sra_run_id, ngs_read_file_source,
num_chromosomes, num_genes, num_segments, assembly_type,
assembly_level]

for sequence in assembly_stats.items():
try:
assembly_stats[sequence[0]][0] = samples[assembly_stats[sequence[0]][2]][0]
assembly_stats[sequence[0]][3] = samples[assembly_stats[sequence[0]][2]][1]
assembly_stats[sequence[0]][4] = samples[assembly_stats[sequence[0]][2]][2]
assembly_stats[sequence[0]][5] = samples[assembly_stats[sequence[0]][2]][3]
assembly_stats[sequence[0]][6] = samples[assembly_stats[sequence[0]][2]][4]
assembly_stats[sequence[0]][7] = samples[assembly_stats[sequence[0]][2]][5]
assembly_stats[sequence[0]][8] = samples[assembly_stats[sequence[0]][2]][6]
assembly_stats[sequence[0]][9] = samples[assembly_stats[sequence[0]][2]][7]
assembly_stats[sequence[0]][10] = samples[assembly_stats[sequence[0]][2]][8]
assembly_stats[sequence[0]][12] = samples[assembly_stats[sequence[0]][2]][9]
assembly_stats[sequence[0]][13] = samples[assembly_stats[sequence[0]][2]][10]
assembly_stats[sequence[0]][14] = samples[assembly_stats[sequence[0]][2]][11]
assembly_stats[sequence[0]][25] = samples[assembly_stats[sequence[0]][2]][12]
assembly_stats[sequence[0]][26] = samples[assembly_stats[sequence[0]][2]][13]
except KeyError as error:
gen_bank = assembly_stats[sequence[0]][2]
ref_seq = gen_bank.replace('GCA', 'GCF')
print(error)
assembly_stats[sequence[0]][0] = samples[ref_seq][0]
assembly_stats[sequence[0]][3] = samples[ref_seq][1]
assembly_stats[sequence[0]][4] = samples[ref_seq][2]
assembly_stats[sequence[0]][5] = samples[ref_seq][3]
assembly_stats[sequence[0]][6] = samples[ref_seq][4]
assembly_stats[sequence[0]][7] = samples[ref_seq][5]
assembly_stats[sequence[0]][8] = samples[ref_seq][6]
assembly_stats[sequence[0]][9] = samples[ref_seq][7]
assembly_stats[sequence[0]][10] = samples[ref_seq][8]
assembly_stats[sequence[0]][12] = samples[ref_seq][9]
assembly_stats[sequence[0]][13] = samples[ref_seq][10]
assembly_stats[sequence[0]][14] = samples[ref_seq][11]
assembly_stats[sequence[0]][25] = samples[ref_seq][12]
assembly_stats[sequence[0]][26] = samples[ref_seq][13]
import pdb; pdb.set_trace()

return assembly_stats

def stats_report(stats_ftp, assembly_stats):
"""Assembly Stats Report

Downloads and then parses the NCBI Assembly Stats Report



Parameters
----------
stats_ftp: str
ftp url for NCBI assembly stats files
assembly_stats: dict
A dictionary for stroing results and creating final table
"""

infraspecific_name = '-'
genome_assembly_id = '-'
# if os.path.exists('home/assembly/stats') is True:
# stats_file = 'GCF_000865725.1_ViralMultiSegProj15521_assembly_stats.txt'
# report_file = 'GCF_000865725.1_ViralMultiSegProj15521_assembly_report.txt'
# else:
ftp = FTP('ftp.ncbi.nlm.nih.gov')
ftp.login()
stats_ftp = stats_ftp.split('://')[1]
Expand All @@ -201,7 +219,7 @@ def stats_report(stats_ftp, genome_stats):
genome_dir = stats_ftp.split(stats_file)[0]
genome_dir = genome_dir.replace('ftp.ncbi.nlm.nih.gov/', '')
ftp.cwd(genome_dir)

genbank_assembly_id = '-'
os.system('mkdir home/assembly/stats')

with open('home/assembly/stats/' + stats_file, 'wb') as file:
Expand All @@ -213,26 +231,46 @@ def stats_report(stats_ftp, genome_stats):
with open('home/assembly/stats/' + report_file, encoding='utf-8') as reportfile:
report_tsv = csv.reader(reportfile, delimiter='\t')
for row in report_tsv:
if row[0].startswith('# GenBank assembly accession: '):
genome_assembly_id = row[0].replace('# GenBank assembly accession: ', '')

if row[0].startswith('# RefSeq assembly accession:'):
genbank_assembly_id = row[0].replace('# RefSeq assembly accession: ', '')

if row[0].startswith('# Infraspecific name:'):
infraspecific_name = row[0].replace('# Infraspecific name: ', '')
if row[0].startswith('#'):
continue
genome_stats[row[6]] = [row[0], row[1], row[2], row[3], row[4],\
row[5], row[7], row[8], row[9]]
if row[6] == 'na':
assembly_stats[row[0]] = ['-', infraspecific_name, genbank_assembly_id, '-', '-',
'-', '-', '-', '-', '-', '-', row[0], '-', '-', '-', '-', '-',
'-', '-', '-', '-', '-', '-', '-', '-', '-', '-']
else:
assembly_stats[row[6]] = ['-', infraspecific_name, genome_assembly_id, '-', '-',
'-', '-', '-', '-', '-', '-', row[0], '-', '-', '-', '-', '-',
'-', '-', '-', '-', '-', '-', '-', '-', '-', '-']

with open('home/assembly/stats/' + stats_file, 'r', encoding='utf8') as statfile:
genome_tsv = csv.reader(statfile, delimiter="\t")
for row in genome_tsv:
if row[0].startswith('#'):
continue
if row[0] == 'all' and row[4] == 'gc-perc':
genome_stats['gc-perc'] = row[5]
if row[0] == 'all' and row[4] == 'scaffold-count':
genome_stats['top-level-count'] = row[5]
if row[0] == 'all' and row[4] == 'scaffold-N75':
genome_stats['scaffold-N75'] = row[5]
if row[0] == 'all' and row[4] == 'scaffold-N90':
genome_stats['scaffold-N90'] = row[5]
import pdb; pdb.set_trace()
return genome_stats
for genome in assembly_stats:
segment = assembly_stats[genome][11]
if row[1] == segment and row[3] == 'all' and row[4] == 'gc-perc':
assembly_stats[genome][15] = row[5]
if row[1] == segment and row[3] == 'all' and row[4] == 'total-length':
assembly_stats[genome][16] = row[5]
if row[1] == segment and row[3] == 'all' and row[4] == 'scaffold-N50':
assembly_stats[genome][17] = row[5]
if row[1] == 'all' and row[3] == 'all' and row[4] == 'scaffold-N75':
assembly_stats[genome][18] = row[5]
if row[1] == 'all' and row[3] == 'all' and row[4] == 'scaffold-N90':
assembly_stats[genome][19] = row[5]
if row[1] == 'all' and row[3] == 'all' and row[4] == 'scaffold-L50':
assembly_stats[genome][20] = row[5]

return assembly_stats

def get_lineage(taxonomy_id):
"""Get lineage
Expand Down Expand Up @@ -277,6 +315,7 @@ def sample_output( samples, header, output):
row.append(samples[key][item])
writer.writerow(row)
print('Samples written to ', sample_file)

else:
print('\t'.join(item for item in header))
for run in samples:
Expand All @@ -285,23 +324,23 @@ def sample_output( samples, header, output):
def main():
"""Main"""

samples = {}
header = ['genome_assembly_id', 'lineage', 'taxonomy_id', 'bco_id', 'schema_version', \
'analysis_platform', 'analysis_platform_object_id', 'bioproject', \
'biosample', 'sra_run_id', 'ngs_read_file_name', 'ngs_read_file_source', \
'organism_name', 'ref_genome_acc', 'genomic_section', \
assembly_stats = {}
header = ['ref_genome_acc', 'organism_name', 'infraspecific_name', \
'genome_assembly_id', 'lineage', 'taxonomy_id', 'bco_id', \
'schema_version', 'analysis_platform', 'analysis_platform_object_id', \
'sra_run_id', 'ngs_read_file_source', 'genomic_section', \
'num_chromosomes', 'num_genes', 'num_segments', 'assembly_gc_content', \
'length', 'n50', 'n75', 'n90', 'l50', 'l75', 'number_of_n', \
'percent_assembly_greater_20x', 'query_coverage_against_reference', \
'percent_identity_against_reference', 'reads_unaligned', 'id_method',\
'assembly_type', 'assembly_level', 'assembly_score']
'length', 'n50', 'n75', 'n90', 'l50', 'l75', \
'query_coverage_against_reference', \
'percent_identity_against_reference', 'percent_reads_unaligned', \
'assembly_type', 'assembly_level']

args = usr_args()
if args.input == 'api':
argosdb_api(samples)
argosdb_api(assembly_stats)
else:
parse_xml(args.input, samples)
sample_output(samples, header, args.output)
parse_xml(args.input, assembly_stats)
sample_output(assembly_stats, header, args.output)

if __name__ == "__main__":
main()
4 changes: 2 additions & 2 deletions lib/dict_release_notes.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,14 +54,14 @@ def usr_args():

def load_tsv(options):
"""Load TSVs

Parameters
----------
options.old: str
An inpit directory string.
options.new: str
An inpit directory string.

Returns
-------
Printout of comparison stats.
Expand Down
3 changes: 2 additions & 1 deletion lib/xml_feature_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,11 @@ def parse_xml(xml_file, output):

# tag_list = tag.split('/')
items = []
print('start')
try:
count = 0
root = ET.parse(xml_file).getroot()
for item in root.findall('.'):
for item in root.findall('./DocumentSummarySet'):
for run in item.findall('./DocumentSummary'):
for exp in run.findall('./Runs/'):
items.append(exp.attrib['acc'])
Expand Down
14 changes: 14 additions & 0 deletions test.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
ref_genome_acc organism_name infraspecific_name genome_assembly_id lineage taxonomy_id bco_id schema_version analysis_platform analysis_platform_object_id sra_run_id ngs_read_file_source genomic_section num_chromosomes num_genes num_segments assembly_gc_content length n50 n75 n90 l50 l75 query_coverage_against_reference percent_identity_against_reference percent_reads_unaligned assembly_type assembly_level
NC_001608.3 - - GCF_000857325.2 - - - - - - - - NC_001608.3 - - - - - - 29903 29903 1 - - - - - -
NZ_CP014051.2 - strain=LT2 GCF_001558355.2 - - - - - - - - CP014051.2 - - - - - - 29903 29903 1 - - - - - -
NZ_CP014050.2 - strain=LT2 GCF_001558355.2 - - - - - - - - unnamed - - - 53 93933 93933 29903 29903 1 - - - - - -
NC_002023.1 - strain=A/Puerto Rico/8/1934 GCF_000865725.1 - - - - - - - - 1 - - - 43.5 2341 2341 29903 29903 1 - - - - - -
NC_002021.1 - strain=A/Puerto Rico/8/1934 GCF_000865725.1 - - - - - - - - 2 - - - 42 2341 2341 29903 29903 1 - - - - - -
NC_002022.1 - strain=A/Puerto Rico/8/1934 GCF_000865725.1 - - - - - - - - 3 - - - 42 2233 2233 29903 29903 1 - - - - - -
NC_002017.1 - strain=A/Puerto Rico/8/1934 GCF_000865725.1 - - - - - - - - 4 - - - 41.5 1778 1778 29903 29903 1 - - - - - -
NC_002019.1 - strain=A/Puerto Rico/8/1934 GCF_000865725.1 - - - - - - - - 5 - - - 46.5 1565 1565 29903 29903 1 - - - - - -
NC_002018.1 - strain=A/Puerto Rico/8/1934 GCF_000865725.1 - - - - - - - - 6 - - - 42.5 1413 1413 29903 29903 1 - - - - - -
NC_002016.1 - strain=A/Puerto Rico/8/1934 GCF_000865725.1 - - - - - - - - 7 - - - 47 1027 1027 29903 29903 1 - - - - - -
NC_002020.1 - strain=A/Puerto Rico/8/1934 GCF_000865725.1 - - - - - - - - 8 - - - 44.5 890 890 29903 29903 1 - - - - - -
na - - - - - - - - - - - K03455.1 - - - - - - 29903 29903 1 - - - - - -
NC_045512.2 - - GCF_009858895.2 - - - - - - - - NC_045512.2 - - - - - - 29903 29903 1 - - - - - -