Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 44 additions & 13 deletions data_files/assemblyQC_NCBI.tsv

Large diffs are not rendered by default.

31 changes: 29 additions & 2 deletions lib/assembly_qc_ncbi_.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,31 @@ def usr_args():

return parser.parse_args()

def read_datasheet(input_path):
    """Read input data sheet and fetch assembly docsums from NCBI.

    Designed to take the ARGOS ngs ID List and Selection Criteria and extract
    all unique NCBI assembly accessions (column 3 of the TSV), then shell out
    to NCBI `efetch` to download their docsum XML into home/genomes.xml.

    Parameters
    ----------
    input_path: str
        file path/name to be parsed
    """

    # dict keys act as an insertion-ordered set: O(1) dedup instead of the
    # O(n^2) `not in list` membership test.
    seen = {}

    with open(input_path, 'r', encoding='utf-8') as accessions:
        data = csv.reader(accessions, delimiter='\t')
        next(data, None)  # skip the header row; tolerate an empty file
        for row in data:
            # Guard against short rows, then skip blank accessions.
            if len(row) > 2 and row[2] != '':
                seen.setdefault(row[2], None)

    genomes = ','.join(seen)
    # NOTE(review): accession strings are interpolated straight into a shell
    # command — acceptable for a trusted curated sheet, but sanitize or switch
    # to subprocess with an argument list if the input is ever untrusted.
    efetch = f'efetch -db assembly -id {genomes} -format docsum > home/genomes.xml'
    os.system(efetch)

def parse_xml(xml_file, samples):
"""Parse XML file

Expand All @@ -81,7 +106,7 @@ def parse_xml(xml_file, samples):
= assembly_type = assembly_level = assembly_score = '-'

analysis_platform = 'NCBI'
bco_id = 'https://biocomputeobject.org/ARGOS_000038/DRAFT'
bco_id = 'ARGOS_000038'
schema_version = 'v0.9'
genomic_section = 'all'

Expand Down Expand Up @@ -188,6 +213,7 @@ def get_lineage(taxonomy_id):
Retrieves the NCBI Taxonomic lineage given a tax id
"""
Entrez.email = "[email protected]"
Entrez.api_key = os.getenv('NCBI_API_KEY')
handle = Entrez.efetch(db="taxonomy", id=taxonomy_id)
record = Entrez.read(handle)
return record[0]['Lineage']
Expand Down Expand Up @@ -239,7 +265,8 @@ def main():
'assembly_type', 'assembly_level', 'assembly_score']

args = usr_args()
parse_xml(args.file, samples)
read_datasheet(args.file)
parse_xml('home/genomes.xml', samples)
sample_output(samples, header, args.output)

if __name__ == "__main__":
Expand Down
4 changes: 2 additions & 2 deletions lib/dict_release_notes.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,14 +54,14 @@ def usr_args():

def load_tsv(options):
"""Load TSVs

Parameters
----------
options.old: str
An input directory string.
options.new: str
An input directory string.

Returns
-------
Printout of comparison stats.
Expand Down
107 changes: 107 additions & 0 deletions lib/ngs_id_script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import csv
import pandas as pd
pd.options.mode.chained_assignment = None # removes warning message from overwriting --- I think

#Read through HIVE Lab ngs_QC and pull out relevant data
with open("c:/users/jgerg/data.argosdb/data_files/template_ngsQC_HL.tsv", 'r') as inFile1:
    tsvreader = csv.reader(inFile1, delimiter="\t")
    with open('c:/users/jgerg/data.argosdb/home/trimmed_HL.tsv', 'w', newline='') as outFile:
        tsvwriter = csv.writer(outFile, delimiter="\t")
        #remove headers from ngs_QC file
        headings=next(tsvreader)
        #append headers for ngs_id
        tsvwriter.writerow(['organism_name', 'leaf_node', 'genome_assembly_id', 'taxonomy_id', 'bioproject','sra_run_id','ref_org','selection_notes','lab_name','files_processed'])
        # Copy only the 6 source columns that map onto the 10-column ngs_id
        # layout; the last 4 (ref_org..files_processed) are populated later,
        # so pandas reads them back as empty/NaN columns.
        for row in tsvreader:
            tsvwriter.writerow([row[0],row[1],row[2],row[3],row[9],row[11]])
##
##
### Removes duplicate genome assembly id's but keeps any row that does not have an assembly id
### TODO: this does not correctly deduplicate samples that have NO assembly id
### (e.g. the SARS variants, which all lack one) - also dropping duplicate SRR
### ids may suffice.
###
data_hl = pd.read_table('c:/users/jgerg/data.argosdb/home/trimmed_HL.tsv', sep='\t')
df_hl = data_hl
# Keep rows with a null assembly id, OR the first occurrence of each id.
# NOTE(review): duplicated() is computed on the not-null subset only, so the
# `|` re-aligns two masks with different indexes - confirm the positions
# missing from the duplicated() mask resolve to False (not NaN) under the
# installed pandas version before trusting this filter.
df_hl=df_hl[df_hl['genome_assembly_id'].isnull() | ~df_hl[df_hl['genome_assembly_id'].notnull()].duplicated(subset='genome_assembly_id',keep='first')]
# Attribute assignment fills the existing (header-created) columns; df_hl may
# be a view of data_hl, which is presumably why chained_assignment warnings
# are suppressed at the top of the file.
df_hl.lab_name = 'Hive Lab'
df_hl.files_processed = 'ngsQC_HL'

###Writes the tsv of unique genome assembly ids for HIVE Lab back in place
df_hl.to_csv('c:/users/jgerg/data.argosdb/home/trimmed_HL.tsv', sep = '\t', index = False)

#Repeat for Pond Lab: same trim -> dedup -> annotate pipeline as the HIVE Lab
#section above, over the Pond Lab template.
with open("c:/users/jgerg/data.argosdb/data_files/template_ngsQC_Pond.tsv", 'r') as inFile2:
    tsvreader = csv.reader(inFile2, delimiter="\t")
    with open('c:/users/jgerg/data.argosdb/home/trimmed_Pond.tsv', 'w', newline='') as outFile:
        tsvwriter = csv.writer(outFile, delimiter="\t")
        #remove headers from ngs_QC file
        headings=next(tsvreader)
        #append headers for ngs_id
        tsvwriter.writerow(['organism_name', 'leaf_node', 'genome_assembly_id', 'taxonomy_id', 'bioproject','sra_run_id','ref_org','selection_notes','lab_name','files_processed'])
        # Only 6 of the 10 columns come from the source file; the rest are
        # filled in later.
        for row in tsvreader:
            tsvwriter.writerow([row[0],row[1],row[2],row[3],row[9],row[11]])
data_p = pd.read_table('c:/users/jgerg/data.argosdb/home/trimmed_Pond.tsv', sep='\t')
df_p = data_p
# Keep null-assembly-id rows plus the first occurrence of each assembly id.
# NOTE(review): the duplicated() mask has a partial index (not-null rows
# only); confirm the `|` alignment yields False, not NaN, for the rest.
df_p=df_p[df_p['genome_assembly_id'].isnull() | ~df_p[df_p['genome_assembly_id'].notnull()].duplicated(subset='genome_assembly_id',keep='first')]
df_p.lab_name = 'Pond Lab'
df_p.files_processed = 'ngsQC_Pond'

###Writes the tsv of unique genome assembly ids for Pond Lab back in place
df_p.to_csv('c:/users/jgerg/data.argosdb/home/trimmed_Pond.tsv', sep = '\t', index = False)


#Repeat for Crandall Lab: same trim -> dedup -> annotate pipeline as above.
with open("c:/users/jgerg/data.argosdb/data_files/template_ngsQC_Crandall.tsv", 'r') as inFile3:
    tsvreader = csv.reader(inFile3, delimiter="\t")
    with open('c:/users/jgerg/data.argosdb/home/trimmed_Crandall.tsv', 'w', newline='') as outFile:
        tsvwriter = csv.writer(outFile, delimiter="\t")
        #remove headers from ngs_QC file
        headings=next(tsvreader)
        #append headers for ngs_id
        tsvwriter.writerow(['organism_name', 'leaf_node', 'genome_assembly_id', 'taxonomy_id', 'bioproject','sra_run_id','ref_org','selection_notes','lab_name','files_processed'])
        # Only 6 of the 10 columns come from the source file; the rest are
        # filled in later.
        for row in tsvreader:
            tsvwriter.writerow([row[0],row[1],row[2],row[3],row[9],row[11]])
data_c = pd.read_table('c:/users/jgerg/data.argosdb/home/trimmed_Crandall.tsv', sep='\t')
df_c = data_c
# Keep null-assembly-id rows plus the first occurrence of each assembly id.
# NOTE(review): the duplicated() mask has a partial index (not-null rows
# only); confirm the `|` alignment yields False, not NaN, for the rest.
df_c=df_c[df_c['genome_assembly_id'].isnull() | ~df_c[df_c['genome_assembly_id'].notnull()].duplicated(subset='genome_assembly_id',keep='first')]
df_c.lab_name = 'Crandall Lab'
df_c.files_processed = 'ngsQC_Crandall'

###Writes the tsv of unique genome assembly ids for Crandall Lab back in place
df_c.to_csv('c:/users/jgerg/data.argosdb/home/trimmed_Crandall.tsv', sep = '\t', index = False)



# Stack the three per-lab tables into a single frame for final processing.
lab_tables = [df_hl, df_p, df_c]
combined_file = pd.concat(lab_tables)

# All downstream steps operate on `dfinal`.
dfinal = combined_file
#Populate Reference Orgs
# Curated reference-organism assembly accessions.
REF_ORG_IDS = frozenset({
    'GCA_000865725.1',  # Influenza (A/Puerto Rico/8/1934(H1N1))
    'GCA_009858895.3',  # SARS-CoV-2 isolate Wuhan-Hu-1
    'GCA_001558355.2',  # Salmonella LT2
    'GCA_000857325.2',  # Marburg
    'GCA_003102975.1',  # HIV-1 HXB2
})

def ref_org(row):
    """Return "Yes" if the row's genome_assembly_id is one of the curated
    reference assemblies above, otherwise "No".

    row: a pandas Series (or any mapping) exposing row['genome_assembly_id'].
    Missing/NaN ids simply fail the membership test and yield "No".
    """
    return "Yes" if row['genome_assembly_id'] in REF_ORG_IDS else "No"
# Tag every row with its reference-organism status (ref_org applied row-wise).
dfinal=dfinal.assign(ref_org=dfinal.apply(ref_org, axis =1))


#Populate selection_notes for all organisms in ARGOS Bioproject
def selection_notes(row):
    """Return the selection note for a row, or None (written as NaN/blank in
    the output TSV) when the row is not part of the FDA-ARGOS bioproject.

    row: a pandas Series (or any mapping) exposing row['bioproject'].
    """
    if row['bioproject'] == 'PRJNA231221': #ARGOS Bioproject
        return "Belongs to FDA-ARGOS PRJNA231221."
    # TODO: add a branch to populate selection_notes for the SARS variants.
    return None  # explicit: non-ARGOS rows carry no note

# Apply selection_notes row-wise; non-ARGOS rows get None (NaN in the TSV).
dfinal=dfinal.assign(selection_notes=dfinal.apply(selection_notes, axis=1))
# Drop bioproject column - it was only used to populate selection_notes and
# is not part of the ngs_id_list layout.
dfinal=dfinal.drop('bioproject', axis=1)
# Cross-lab dedup: keep rows with no assembly id, plus the first row seen for
# each assembly id. NOTE(review): duplicated() is computed on the not-null
# subset only, so the `|` re-aligns masks with different indexes - confirm
# the missing positions resolve to False (not NaN) under the installed pandas.
dfinal=dfinal[dfinal['genome_assembly_id'].isnull() | ~dfinal[dfinal['genome_assembly_id'].notnull()].duplicated(subset='genome_assembly_id',keep='first')]
dfinal.to_csv('c:/users/jgerg/data.argosdb/data_files/test_ngs_id_list.tsv', sep = '\t', index = False)