Skip to content

Commit 5e79ecf

Browse files
author
Romeo Kienzer
committed
add codenet input
1 parent 70d1373 commit 5e79ecf

File tree

1 file changed

+197
-0
lines changed

1 file changed

+197
-0
lines changed
Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "228bfca6-72ee-4656-b7c4-7694ab085518",
6+
"metadata": {},
7+
"source": [
8+
"# Pulls Codenet classification data from the ml-exchange.org"
9+
]
10+
},
11+
{
12+
"cell_type": "markdown",
13+
"id": "excess-grass",
14+
"metadata": {
15+
"papermill": {
16+
"duration": 0.022097,
17+
"end_time": "2021-04-06T15:33:53.837895",
18+
"exception": false,
19+
"start_time": "2021-04-06T15:33:53.815798",
20+
"status": "completed"
21+
},
22+
"tags": []
23+
},
24+
"source": [
25+
"Pulls Codenet classification data.zip from the ml-exchange.org in a form ready for text classification in the folowing format:\n",
26+
"\n",
27+
"zip_root/data/train/*language_name*/**code_sample_file_id** \n",
28+
"zip_root/data/test/*language_nam*/**code_sample_file_id**"
29+
]
30+
},
31+
{
32+
"cell_type": "code",
33+
"execution_count": null,
34+
"id": "assigned-lottery",
35+
"metadata": {
36+
"papermill": {
37+
"duration": 6.449538,
38+
"end_time": "2021-04-06T15:34:00.298216",
39+
"exception": false,
40+
"start_time": "2021-04-06T15:33:53.848678",
41+
"status": "completed"
42+
},
43+
"tags": []
44+
},
45+
"outputs": [],
46+
"source": [
47+
"!pip3 install wget==3.2"
48+
]
49+
},
50+
{
51+
"cell_type": "code",
52+
"execution_count": null,
53+
"id": "impaired-sharing",
54+
"metadata": {
55+
"papermill": {
56+
"duration": 0.270152,
57+
"end_time": "2021-04-06T15:34:00.631600",
58+
"exception": false,
59+
"start_time": "2021-04-06T15:34:00.361448",
60+
"status": "completed"
61+
},
62+
"tags": []
63+
},
64+
"outputs": [],
65+
"source": [
66+
"import wget\n",
67+
"import logging\n",
68+
"import os\n",
69+
"import re\n",
70+
"import shutil\n",
71+
"import sys\n",
72+
"import tarfile"
73+
]
74+
},
75+
{
76+
"cell_type": "code",
77+
"execution_count": null,
78+
"id": "local-gather",
79+
"metadata": {
80+
"papermill": {
81+
"duration": 0.023729,
82+
"end_time": "2021-04-06T15:34:01.510228",
83+
"exception": false,
84+
"start_time": "2021-04-06T15:34:01.486499",
85+
"status": "completed"
86+
},
87+
"tags": []
88+
},
89+
"outputs": [],
90+
"source": [
91+
"# file name for training data zip\n",
92+
"output_filename = os.environ.get('output_filename', 'data.zip')\n",
93+
"\n",
94+
"# temporal data storage for local execution\n",
95+
"data_dir = os.environ.get('data_dir', '../../data/')"
96+
]
97+
},
98+
{
99+
"cell_type": "code",
100+
"execution_count": null,
101+
"id": "a1cd43db-f550-4bc9-8353-e73bbf1e39ad",
102+
"metadata": {},
103+
"outputs": [],
104+
"source": [
105+
"parameters = list(\n",
106+
" map(\n",
107+
" lambda s: re.sub('$', '\"', s),\n",
108+
" map(\n",
109+
" lambda s: s.replace('=', '=\"'),\n",
110+
" filter(\n",
111+
" lambda s: s.find('=') > -1 and bool(re.match('[A-Za-z0-9_]*=[.\\/A-Za-z0-9]*', s)),\n",
112+
" sys.argv\n",
113+
" )\n",
114+
" )\n",
115+
" )\n",
116+
")\n",
117+
"\n",
118+
"for parameter in parameters:\n",
119+
" logging.warning('Parameter: '+parameter) \n",
120+
" exec(parameter)"
121+
]
122+
},
123+
{
124+
"cell_type": "code",
125+
"execution_count": null,
126+
"id": "widespread-ghana",
127+
"metadata": {
128+
"papermill": {
129+
"duration": 0.021484,
130+
"end_time": "2021-04-06T15:34:01.546626",
131+
"exception": false,
132+
"start_time": "2021-04-06T15:34:01.525142",
133+
"status": "completed"
134+
},
135+
"tags": []
136+
},
137+
"outputs": [],
138+
"source": [
139+
"wget.download('https://dax-cdn.cdn.appdomain.cloud/dax-project-codenet/1.0.0/Project_CodeNet_LangClass.tar.gz')"
140+
]
141+
},
142+
{
143+
"cell_type": "code",
144+
"execution_count": null,
145+
"id": "8e5949af-eeed-46b4-bfe6-e16f0ba52a8d",
146+
"metadata": {},
147+
"outputs": [],
148+
"source": [
149+
"with tarfile.open('Project_CodeNet_LangClass.tar.gz') as tf:\n",
150+
" tf.extractall()"
151+
]
152+
},
153+
{
154+
"cell_type": "code",
155+
"execution_count": null,
156+
"id": "f366a1f7-3b7b-4366-b39d-8361d7b90e0d",
157+
"metadata": {},
158+
"outputs": [],
159+
"source": [
160+
"shutil.make_archive(data_dir + output_filename.split('.zip')[0], 'zip', 'data')"
161+
]
162+
}
163+
],
164+
"metadata": {
165+
"kernelspec": {
166+
"display_name": "Python 3 (ipykernel)",
167+
"language": "python",
168+
"name": "python3"
169+
},
170+
"language_info": {
171+
"codemirror_mode": {
172+
"name": "ipython",
173+
"version": 3
174+
},
175+
"file_extension": ".py",
176+
"mimetype": "text/x-python",
177+
"name": "python",
178+
"nbconvert_exporter": "python",
179+
"pygments_lexer": "ipython3",
180+
"version": "3.8.10"
181+
},
182+
"papermill": {
183+
"default_parameters": {},
184+
"duration": 17.49536,
185+
"end_time": "2021-04-06T15:34:10.489088",
186+
"environment_variables": {},
187+
"exception": null,
188+
"input_path": "/home/jovyan/work/examples/pipelines/pairs/component-library/input/input-climate-copernicus.ipynb",
189+
"output_path": "/home/jovyan/work/examples/pipelines/pairs/component-library/input/input-climate-copernicus.ipynb",
190+
"parameters": {},
191+
"start_time": "2021-04-06T15:33:52.993728",
192+
"version": "2.3.3"
193+
}
194+
},
195+
"nbformat": 4,
196+
"nbformat_minor": 5
197+
}

0 commit comments

Comments
 (0)