Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

updating #119

Merged
merged 10 commits into from
Nov 14, 2022
Merged
Prev Previous commit
Next Next commit
ARGOSDB API argosdb_api_query
Changes to be committed:
	new file:   lib/argosdb_api_query.py
	modified:   lib/assembly_qc_ncbi_.py
  • Loading branch information
HadleyKing committed Nov 14, 2022
commit 54328dcd86336edc2753e9297b2417bee01582a8
34 changes: 34 additions & 0 deletions lib/argosdb_api_query.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/usr/bin/env python3
"""Argos DB API Query

Entry point for ARGOS DB
"""

__version__ = "0.1.0"
__status__ = "BETA"

import os
import requests
api_url = "https://api.argosdb.org/records/search"
results = []
assemblies = []
data = [{
"bcoid": "ARGOS_000012",
"offset": 1,
"limit": 10000
},
{
"bcoid": "ARGOS_000022",
"offset": 1,
"limit": 10000
}]
os.system('mkdir home/assembly')
for item in data:
response = requests.post(api_url, json=item)
results.append(response.json())
for record in response.json()['recordlist']:
if record['genome_assembly_id'] not in assemblies:
assemblies.append(record['genome_assembly_id'])
os.system(f"efetch -db assembly -id {record['genome_assembly_id']} -format docsum > home/{record['genome_assembly_id']}.xml")
print(response.status_code)
print(assemblies)
97 changes: 79 additions & 18 deletions lib/assembly_qc_ncbi_.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,16 @@
This script will parse an XML for a single feature and return a list via terminal
or file, if an output is supplied.
"""
__version__ = "0.1.0"
__status__ = "BETA"
__version__ = "1.0.0"
__status__ = "PRODUCTION"

import os
import sys
import csv
from ftplib import FTP
import xml.etree.ElementTree as ET
from argparse import ArgumentParser, SUPPRESS
import requests
from Bio import Entrez


Expand All @@ -28,20 +29,16 @@ def usr_args():

parser = ArgumentParser(
add_help=False,
prog='data_sheet_validator',
description="Release Note Generator. "
"Used to generate release notes from two differnt versions.")
prog='assembly_qc_ncbi',
description="Assembly QC NCBI. "
"Used to gather metrics from NCBI assemblies.")

required = parser.add_argument_group('required arguments')
optional = parser.add_argument_group('optional arguments')

required.add_argument('-f', '--file',
required.add_argument('-i', '--input',
required=True,
help="File and path for parsing.")

# required.add_argument('-', '--new',
# required=True,
# help="Directory for new version of dictionary")
help="Either 'api' for an api query or a file and path for parsing.")

optional.add_argument('-o', '--output',
help="Output file. If no output is provided, the resulting table will \
Expand All @@ -59,6 +56,36 @@ def usr_args():

return parser.parse_args()

def argosdb_api(samples):
"""ARGOSDB API Query

"""

api_url = "https://api.argosdb.org/records/search"
results = []
assemblies = []
os.system('mkdir home/assembly')
data = [{
"bcoid": "ARGOS_000012",
"offset": 1,
"limit": 10000
},
{
"bcoid": "ARGOS_000022",
"offset": 1,
"limit": 10000
}]
for item in data:
response = requests.post(api_url, json=item)
results.append(response.json())
for record in response.json()['recordlist']:
if record['genome_assembly_id'] not in assemblies:
assemblies.append(record['genome_assembly_id'])
ids = ','.join(assemblies)
os.system(f"efetch -db assembly -id {ids} -format docsum \
> home/assembly/genome_assembly.xml")
parse_xml('home/assembly/genome_assembly.xml', samples)

def parse_xml(xml_file, samples):
"""Parse XML file

Expand All @@ -68,6 +95,12 @@ def parse_xml(xml_file, samples):
file path/name to be parsed
output: str, optional
file path/name for data to be output to

Returns
-------
samples: dict
A python dictionary of the assembly statitistics from NCBI with the
genome_assembly_id as the index value.
"""
genome_stats = {}
genome_assembly_id = lineage = taxonomy_id = bco_id = schema_version \
Expand All @@ -81,7 +114,7 @@ def parse_xml(xml_file, samples):
= assembly_type = assembly_level = assembly_score = '-'

analysis_platform = 'NCBI'
bco_id = 'https://biocomputeobject.org/ARGOS_000038/DRAFT'
bco_id = 'ARGOS_000038'
schema_version = 'v0.9'
genomic_section = 'all'

Expand All @@ -94,7 +127,7 @@ def parse_xml(xml_file, samples):
genome_assembly_id = summary.text
if summary.tag == 'Id':
analysis_platform_object_id = summary.text
if summary.tag == 'Organism':
if summary.tag == 'SpeciesName':
organism_name = summary.text
if summary.tag == 'Taxid':
taxonomy_id = summary.text
Expand Down Expand Up @@ -148,25 +181,44 @@ def parse_xml(xml_file, samples):
, percent_assembly_greater_20x , query_coverage_against_reference \
, percent_identity_against_reference , reads_unaligned , id_method \
, assembly_type , assembly_level , assembly_score

print(samples)
return samples

def stats_report(stats_ftp, genome_stats):
"""Assembly Stats Report

Downloads and then parses the NCBI Assembly Stats Report



"""

ftp = FTP('ftp.ncbi.nlm.nih.gov')
ftp.login()
stats_ftp = stats_ftp.split('://')[1]
stats_file = stats_ftp.split('/')[-1]
report_file = stats_file.replace('assembly_stats','assembly_report')
genome_dir = stats_ftp.split(stats_file)[0]
genome_dir = genome_dir.replace('ftp.ncbi.nlm.nih.gov/', '')
ftp.cwd(genome_dir)
with open('home/' + stats_file, 'wb') as file:

os.system('mkdir home/assembly/stats')

with open('home/assembly/stats/' + stats_file, 'wb') as file:
ftp.retrbinary(f'RETR {stats_file}', file.write )

with open('home/' + stats_file, 'r', encoding='utf8') as statfile:
with open('home/assembly/stats/' + report_file, 'wb') as file:
ftp.retrbinary(f'RETR {report_file}', file.write )

with open('home/assembly/stats/' + report_file, encoding='utf-8') as reportfile:
report_tsv = csv.reader(reportfile, delimiter='\t')
for row in report_tsv:
if row[0].startswith('#'):
continue
genome_stats[row[6]] = [row[0], row[1], row[2], row[3], row[4],\
row[5], row[7], row[8], row[9]]

with open('home/assembly/stats/' + stats_file, 'r', encoding='utf8') as statfile:
genome_tsv = csv.reader(statfile, delimiter="\t")
for row in genome_tsv:
if row[0].startswith('#'):
Expand All @@ -179,13 +231,18 @@ def stats_report(stats_ftp, genome_stats):
genome_stats['scaffold-N75'] = row[5]
if row[0] == 'all' and row[4] == 'scaffold-N90':
genome_stats['scaffold-N90'] = row[5]

import pdb; pdb.set_trace()
return genome_stats

def get_lineage(taxonomy_id):
"""Get lineage

Retrieves the NCBI Taxonomic lineage given a tax id

Parameters
----------
taxonomy_id: str
NCBI Taxonomy identifier
"""
Entrez.email = "[email protected]"
handle = Entrez.efetch(db="taxonomy", id=taxonomy_id)
Expand Down Expand Up @@ -227,6 +284,7 @@ def sample_output( samples, header, output):

def main():
"""Main"""

samples = {}
header = ['genome_assembly_id', 'lineage', 'taxonomy_id', 'bco_id', 'schema_version', \
'analysis_platform', 'analysis_platform_object_id', 'bioproject', \
Expand All @@ -239,7 +297,10 @@ def main():
'assembly_type', 'assembly_level', 'assembly_score']

args = usr_args()
parse_xml(args.file, samples)
if args.input == 'api':
argosdb_api(samples)
else:
parse_xml(args.input, samples)
sample_output(samples, header, args.output)

if __name__ == "__main__":
Expand Down