ngs from API (#115)
* fix broken file names

* Added ngs_id_script.py and template docs

* Added ngs_id_script.py and template dataset

* updated script

Co-authored-by: hadleyking <[email protected]>
Jgergely11 and HadleyKing authored Nov 2, 2022
1 parent 0d1b402 commit 260b325
Showing 1 changed file with 230 additions and 0 deletions: lib/ngs_id_from_api.py
@@ -0,0 +1,230 @@

import os
import requests
import json
import csv
import pandas as pd
pd.options.mode.chained_assignment = None  # silence SettingWithCopyWarning from the column overwrites below

api_url = "https://beta-api.argosdb.org/records/search"
HIVE_results = []
Crandall_results = []
HIVE_data = [{
    "bcoid": "ARGOS_000019",
    "offset": 1,
    "limit": 10000
}]
Crandall_data = [{
    "bcoid": "ARGOS_000025",
    "offset": 1,
    "limit": 10000
}]
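
# Each payload queries the ArgosDB search endpoint for every record tied to a
# single BioCompute Object ID (bcoid). The response is assumed to be JSON with
# a 'recordlist' array of flat per-run records (field layout inferred from how
# 'recordlist' is consumed below).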

### Need BCO for Pond NGS Data

# defines function to get HIVE Lab specific data
def get_HIVE():
    for item in HIVE_data:
        response = requests.post(api_url, json=item)
        HIVE_results.append(response.json())
    with open('hiveapi.json', 'w', newline='', encoding='utf-8') as f:
        json.dump(HIVE_results, f, ensure_ascii=False, indent=4)

    # Convert json to tsv and make keys into column headers
    with open('hiveapi.json') as jsonfile:
        data = json.load(jsonfile)
    records = data[0]['recordlist']
    # 'with' closes the temp file before it is re-read below
    with open('hiveTEMP.tsv', 'w', newline='') as datafile:
        tsvwriter = csv.writer(datafile, delimiter='\t')
        count = 0
        for record in records:
            if count == 0:
                header = record.keys()
                tsvwriter.writerow(header)
                count += 1
            tsvwriter.writerow(record.values())

    # Read/write tsv - writes appropriate headers for ngs_id, then writes the
    # corresponding columns from the ngsQC file
    with open('hiveTEMP.tsv', 'r') as source:
        reader = csv.reader(source, delimiter='\t')
        with open('hiveapi.tsv', 'w', newline='') as result:
            writer = csv.writer(result, delimiter='\t')
            headings = next(reader)  # skip the raw API header row
            # append headers for ngs_id
            writer.writerow(['organism_name', 'leaf_node', 'genome_assembly_id',
                             'taxonomy_id', 'bioproject', 'biosample', 'sra_run_id',
                             'ngs_read_file_source', 'ref_org', 'isolate_identifiers',
                             'selection_notes', 'lab_name', 'files_processed'])
            # These columns need to be updated/changed to reflect new columns added to ngs_id
            for r in reader:
                writer.writerow([r[3], r[4], r[6], r[5], r[11], r[12], r[23], r[13]])

get_HIVE()
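
# Note: hiveapi.tsv carries all 13 ngs_id headers but only the first 8 columns
# are populated from the API dump; ref_org, selection_notes, lab_name, and
# files_processed are filled in by the pandas steps below, while
# isolate_identifiers is left blank for now.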

data_hl = pd.read_table('hiveapi.tsv', sep='\t', on_bad_lines='skip')

df_hl = data_hl
# This line originally filtered assembly ids; it now deduplicates on sra_run_id (see the note below)
df_hl = df_hl[df_hl['sra_run_id'].isnull() | ~df_hl[df_hl['sra_run_id'].notnull()].duplicated(subset='sra_run_id', keep='first')]
df_hl['lab_name'] = 'HIVE Lab'
df_hl['files_processed'] = 'ngsQC_HL'
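
# Note on the mask above: the isnull() test covers every row, while
# duplicated() is computed only over the non-null subset; pandas index-aligns
# the two when OR-ing and treats missing positions as False, so rows with no
# sra_run_id always pass via the left side, and rows with one pass only if
# they are the first occurrence of that id.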

# for record in response.json()['recordlist']:
#     if record['genome_assembly_id'] not in assemblies:
#         assemblies.append(record['genome_assembly_id'])
#         os.system(f"efetch -db assembly -id {record['genome_assembly_id']} -format docsum > test5/{record['genome_assembly_id']}.xml")
# print(response.status_code)

# defines function to get Crandall Lab specific data
def get_Crandall():
    for item in Crandall_data:
        response = requests.post(api_url, json=item)
        Crandall_results.append(response.json())
    with open('Crandallapi.json', 'w', newline='', encoding='utf-8') as f:
        json.dump(Crandall_results, f, ensure_ascii=False, indent=4)

    # Convert json to tsv and make keys into column headers
    with open('Crandallapi.json') as jsonfile:
        data = json.load(jsonfile)
    records = data[0]['recordlist']
    with open('CrandallTEMP.tsv', 'w', newline='') as datafile:
        tsvwriter = csv.writer(datafile, delimiter='\t')
        count = 0
        for record in records:
            if count == 0:
                header = record.keys()
                tsvwriter.writerow(header)
                count += 1
            tsvwriter.writerow(record.values())

    # Read/write tsv - writes appropriate headers for ngs_id, then writes the
    # corresponding columns from the ngsQC file
    #
    # Need to figure out how to pull assembly id for corresponding Biosample
    with open('CrandallTEMP.tsv', 'r') as source:
        reader = csv.reader(source, delimiter='\t')
        with open('Crandallapi.tsv', 'w', newline='') as result:
            writer = csv.writer(result, delimiter='\t')
            headings = next(reader)  # skip the raw API header row
            # append headers for ngs_id
            ####
            #### ngsQC_Crandall hasn't been updated for v9 - this will need to be edited
            ####
            writer.writerow(['organism_name', 'leaf_node', 'genome_assembly_id',
                             'taxonomy_id', 'bioproject', 'biosample', 'sra_run_id',
                             'ngs_read_file_source', 'ref_org', 'isolate_identifiers',
                             'selection_notes', 'lab_name', 'files_processed'])
            # These columns need to be updated/changed to reflect new columns added to ngs_id
            for r in reader:
                writer.writerow([r[3], r[4], r[6], r[5], r[11], r[12], r[13], r[14]])

get_Crandall()


data_c = pd.read_table('Crandallapi.tsv', sep='\t', on_bad_lines='skip')
df_c = data_c
#This line originally filtered assembly ids
df_c=df_c[df_c['sra_run_id'].isnull() | ~df_c[df_c['sra_run_id'].notnull()].duplicated(subset='sra_run_id',keep='first')]
df_c.lab_name = 'Crandall Lab'
df_c.files_processed = 'ngsQC_Crandall'
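
# Note: the Crandall mapping takes its last two columns from r[13]/r[14] where
# the HIVE mapping uses r[23]/r[13]; presumably the two labs' ngsQC sheets
# expose different column layouts (assumption, based on the v9 TODO above).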

# def get_Pond():
#     for item in Pond_data:
#         response = requests.post(api_url, json=item)
#         Pond_results.append(response.json())
#     with open('Pondapi.json', 'w', newline='', encoding='utf-8') as f:
#         json.dump(Pond_results, f, ensure_ascii=False, indent=4)
#
#     # Convert json to tsv and make keys into column headers
#     with open('Pondapi.json') as jsonfile:
#         data = json.load(jsonfile)
#     records = data[0]['recordlist']
#     with open('Pondapi.tsv', 'w', newline='') as datafile:
#         tsvwriter = csv.writer(datafile, delimiter='\t')
#         count = 0
#         for record in records:
#             if count == 0:
#                 header = record.keys()
#                 tsvwriter.writerow(header)
#                 count += 1
#             tsvwriter.writerow(record.values())
#
#     # Read/write tsv - writes appropriate headers for ngs_id, then writes the
#     # corresponding columns from the ngsQC file
#     #
#     # Need to figure out how to pull assembly id for corresponding Biosample
#     with open('PondTEMP.tsv', 'r') as source:
#         reader = csv.reader(source, delimiter='\t')
#         with open('Pondapi.tsv', 'w', newline='') as result:
#             writer = csv.writer(result, delimiter='\t')
#             headings = next(reader)
#             # append headers for ngs_id
#             writer.writerow(['organism_name', 'leaf_node', 'genome_assembly_id',
#                              'taxonomy_id', 'bioproject', 'biosample', 'sra_run_id',
#                              'ngs_read_file_source', 'ref_org', 'isolate_identifiers',
#                              'selection_notes', 'lab_name', 'files_processed'])
#             # These columns need to be updated/changed to reflect new columns added to ngs_id
#             for r in reader:
#                 writer.writerow([r[3], r[4], r[6], r[5], r[11], r[12], r[23], r[13]])
#
# get_Pond()

dfs = [df_hl, df_c]
# dfs = [df_hl, df_c, df_p]   # include df_p once the Pond BCO is available
combined_file = pd.concat(dfs)
dfinal = combined_file
# Populate Reference Orgs
def ref_org(row):
    if row['genome_assembly_id'] == 'GCA_000865725.1':    # (A/Puerto Rico/8/1934(H1N1))
        return "Yes"
    elif row['genome_assembly_id'] == 'GCA_009858895.3':  # isolate Wuhan-Hu-1
        return "Yes"
    elif row['genome_assembly_id'] == 'GCA_001558355.2':  # LT2
        return "Yes"
    elif row['genome_assembly_id'] == 'GCA_000857325.2':  # Marburg
        return "Yes"
    elif row['genome_assembly_id'] == 'GCA_003102975.1':  # HXB2
        return "Yes"
    else:
        return "No"

dfinal = dfinal.assign(ref_org=dfinal.apply(ref_org, axis=1))
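
# Equivalent, more compact sketch (not used above, to keep each reference
# isolate's annotation on its own line):
#   REF_ASSEMBLIES = {'GCA_000865725.1', 'GCA_009858895.3', 'GCA_001558355.2',
#                     'GCA_000857325.2', 'GCA_003102975.1'}
#   dfinal['ref_org'] = dfinal['genome_assembly_id'].isin(REF_ASSEMBLIES).map({True: 'Yes', False: 'No'})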

# Populate selection_notes for all organisms in ARGOS Bioproject
def selection_notes(row):
    if row['bioproject'] == 'PRJNA231221':
        return "Belongs to FDA-ARGOS PRJNA231221."
    elif row['bioproject'] == 'PRJNA726840':
        return "Coding-complete Genome sequences for SARS-CoV-2 B.1.1.7 and B.1.351 Variants from Metro Manila, Philippines, outlined in the following paper: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8281087/"
    elif row['bioproject'] == 'PRJNA729484':
        return "Raw sequencing reads were collected for the SARS-CoV-2 P.1 variant in Northeast Brazil, outlined in this paper: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8321350/"
    elif row['bioproject'] == 'PRJNA791622' and row['lab_name'] == 'Pond Lab':
        return "For omicron, we are selecting EPI_ISL_6913953. Sequencing was conducted on Illumina MiSeq, has high coverage, and a consistent quality score across all base calls above 30. Raw reads are available at https://www.ncbi.nlm.nih.gov/sra/SRX13486794, and a full description of the patient harboring the virus is supplied with the following publication https://academic.oup.com/cid/advance-article/doi/10.1093/cid/ciab1072/6494531?login=true. The patient was one of the first two known COVID-19 cases classified as omicron in Japan. To put the collection date of 28 November 2021 in perspective, the first known omicron sample was collected on 8 November 2021. Raw reads from South Africa are available, but the average phred quality score is much lower for those samples."
    elif row['bioproject'] == 'PRJNA791622' and row['lab_name'] == 'HIVE Lab':
        return "Raw sequencing reads were collected as part of a fusogenicity and pathogenicity study, outlined in the following paper: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8942852/"
    elif row['bioproject'] == 'PRJNA603194':
        return "SARS-CoV-2 original isolate from human lung metagenome from Wuhan."
    elif row['bioproject'] == 'PRJEB12890':
        return "List of SRA IDs retrieved using search string in SRA 'txid211044[Organism:exp].' The SRA ID was selected based on NCBI search filters Source: RNA, Type: genome, Library layout: paired, Platform: Illumina. Project National Collection of Pathogenic Viruses (NCPV) UK sequences well-characterised, authenticated human pathogenic viruses."
    # bioprojects without a note fall through and return None (NaN in the column)

dfinal = dfinal.assign(selection_notes=dfinal.apply(selection_notes, axis=1))
dfinal = dfinal.sort_values('organism_name', ascending=True)
dfinal.to_csv('ngs_id_from_api.tsv', sep='\t', index=False)
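
# Expected usage (assumption: run from the repo root with requests and pandas
# installed):
#   python lib/ngs_id_from_api.py
# A single ngs_id_from_api.tsv is left behind once the cleanup below removes
# the per-lab intermediates.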

# clean up intermediate files
os.remove('Crandallapi.json')
os.remove('hiveapi.json')
# comment out the lines below to keep a per-lab tsv if necessary
os.remove('hiveapi.tsv')
os.remove('hiveTEMP.tsv')
os.remove('Crandallapi.tsv')
os.remove('CrandallTEMP.tsv')
