|
14 | 14 | "tags": [] |
15 | 15 | }, |
16 | 16 | "source": [ |
17 | | - "# Converts a CSV file with header to parquet using ApacheSpark" |
| 17 | + "# spark-csv-to-parquet" |
| 18 | + ] |
| 19 | + }, |
| 20 | + { |
| 21 | + "cell_type": "markdown", |
| 22 | + "id": "63d16770-79ac-4255-a2d0-29412418bebf", |
| 23 | + "metadata": {}, |
| 24 | + "source": [ |
| 25 | + "Converts a CSV file with header to parquet using ApacheSpark" |
18 | 26 | ] |
19 | 27 | }, |
20 | 28 | { |
|
46 | 54 | "fi" |
47 | 55 | ] |
48 | 56 | }, |
49 | | - { |
50 | | - "cell_type": "code", |
51 | | - "execution_count": null, |
52 | | - "id": "norman-memory", |
53 | | - "metadata": { |
54 | | - "papermill": { |
55 | | - "duration": 0.023625, |
56 | | - "end_time": "2021-03-22T20:29:25.776130", |
57 | | - "exception": false, |
58 | | - "start_time": "2021-03-22T20:29:25.752505", |
59 | | - "status": "completed" |
60 | | - }, |
61 | | - "tags": [] |
62 | | - }, |
63 | | - "outputs": [], |
64 | | - "source": [ |
65 | | - "# @param data_dir temporal data storage for local execution\n", |
66 | | - "# @param data_csv csv path and file name (default: data.csv)\n", |
67 | | - "# @param data_parquet path and parquet file name (default: data.parquet)\n", |
68 | | - "# @param master url of master (default: local mode)" |
69 | | - ] |
70 | | - }, |
71 | 57 | { |
72 | 58 | "cell_type": "code", |
73 | 59 | "execution_count": null, |
|
107 | 93 | }, |
108 | 94 | "outputs": [], |
109 | 95 | "source": [ |
| 96 | + "# source path and file name (default: data.csv)\n", |
110 | 97 | "data_csv = os.environ.get('data_csv', 'data.csv')\n", |
| 98 | + "\n", |
| 99 | + "# destination path and parquet file name (default: data.parquet)\n", |
111 | 100 | "data_parquet = os.environ.get('data_parquet', 'data.parquet')\n", |
| 101 | + "\n", |
| 102 | + "# url of master (default: local mode)\n", |
112 | 103 | "master = os.environ.get('master', \"local[*]\")\n", |
| 104 | + "\n", |
| 105 | + "# temporal data storage for local execution\n", |
113 | 106 | "data_dir = os.environ.get('data_dir', '../../data/')" |
114 | 107 | ] |
115 | 108 | }, |
|
122 | 115 | "source": [ |
123 | 116 | "# override parameters received from a potential call using %run magic\n", |
124 | 117 | "parameters = list(\n", |
125 | | - " map(\n", |
126 | | - " lambda s: re.sub('$', '\"', s),\n", |
127 | | - " map(\n", |
128 | | - " lambda s: s.replace('=', '=\"'),\n", |
129 | | - " filter(\n", |
130 | | - " lambda s: s.find('=') > -1,\n", |
131 | | - " sys.argv\n", |
132 | | - " )\n", |
133 | | - " )\n", |
134 | | - " )\n", |
| 118 | + " map(\n", |
| 119 | + " lambda s: re.sub('$', '\"', s),\n", |
| 120 | + " map(\n", |
| 121 | + " lambda s: s.replace('=', '=\"'),\n", |
| 122 | + " filter(\n", |
| 123 | + " lambda s: s.find('=') > -1 and bool(re.match('[A-Za-z0-9_]*=[.\\/A-Za-z0-9]*', s)),\n", |
| 124 | + " sys.argv\n", |
| 125 | + " )\n", |
| 126 | + " )\n", |
| 127 | + " )\n", |
135 | 128 | ")\n", |
136 | 129 | "\n", |
137 | 130 | "for parameter in parameters:\n", |
| 131 | + " logging.warning('Parameter: '+parameter) \n", |
138 | 132 | " exec(parameter)" |
139 | 133 | ] |
140 | 134 | }, |
|
0 commit comments