ãBigQueryãã¯ã¨ãªã®åä½ãã¹ããæ¸ããã¨æã£ããã©å£ãåãã¦ã©ãããããæ©ãã§ãã話
ããã«ã¡ã¯ã
ä¹ ãã¶ãã®ããã°æ稿ã«ãªã£ã¦ãã¾ã£ãã®ã§ãããä»æ¥ã¯ãã¯ã¨ãªã®åä½ãã¹ããæ¸ããã¨æã£ããã©ãå£ãåãã¦ã©ããããããªã¨æã£ã話ããæ¸ãããã¨æãã¾ãã
ã¯ããã«è¨ã£ã¦ããã¾ããããã®è¨äºã¯ç¹ã«ä½ã解決çãããããã§ããªãã§ããªããããã ãã¯ã¨ãªã®åä½ãã¹ããæ¸ããã¨ããã¨ãããªåé¡ã«ã¶ã¡å½ãã£ã¦ãããã«å¯¾ãã¦ãããããããã¨ã¯æã£ããã©ã誰ãæé«ã®ã½ãªã¥ã¼ã·ã§ã³ãªãï¼ç¬ãã£ã¦èãããã£ãããæ¸ããã ãã®è¨äºã§ãç¬*1
ãã®è¨äºã®ã¢ãã
æè¿ãBigQueryã使ã£ããã¼ã¿åæåºç¤ã¨ãããéçºãã¦ããã®ã§ãããããã¹ãæ¸ãã¦ãªãã¨ããåãã @t_wada ããã®åã§ãåããã¨è¨ããã®ï¼ãã£ã¦è¨ãç¶æ ã«ãªãããããçé¢ç®ã«ãããã¨ããã¨ã©ããªããã ãã£ã¦è¨ããã¨ã§çé¢ç®ã«å¯¾å³ãã¦ã¿ã¾ãã*2ã
ããã§è¨ããçé¢ç®ã«ãã¨è¨ãã®ã¯ããå ¥ãå¾ãå ¨ãã¿ã¼ã³ã®ãã¼ã¿ã«å¯¾ããéè¨ããã£ã¦ãããç¶æ ãæãã¾ãã
èªåãAPIã®åä½ãã¹ããæ¸ãããã¨ãã¯æ®éã«ãã£ããã¨ãããã®ã§ããããã¼ã¿éè¨ç³»ã¨ãæ©æ¢°å¦ç¿ç³»ã¨ãã®ãã¹ãã£ã¦ãã¾ããªãã¨ãããããã¿ã¾ãããæéãªãã£ããã£ã¦ãã¦ãã¾ãã*3ãæ¸ããã¨ãã¦ããä¸éå端ãªããããã ãã¯ãã£ã¦ãããã£ã¦è¨ãã¬ãã«ã§ãã
ã§ãããã£ã±ãããã ãã¯å質ãè½ã¨ãã¡ãè¡ããã*4ãã£ã¦è¨ãé¨åããã£ããå®ããããªã¨ããç¶æ³ã«åºãããã¾ãã¦ãã¾ããããã¨ãªã£ãããã§ãã
ã§ãä»åã¯è²ã æ¤è¨ããçµæããããã£ã¦ãã¡ããã¡ã大å¤ã ãã©ãã¸ã§ã©ãããã®ï¼ãã£ã¦ããã¨ããã«ã¤ãã¦æ¸ãã¦ãããã¨æãã¾ãã
åæ
ç°å¢
ç°å¢ã¨ãã¦ã¯ãBigQueryãä¸å¿ã«è©±ãã¦ããã¾ãããããæ¹ã¯éãã©èãæ¹ã¯åãã«ãªãã®ã§ã¯ãªãããªã¨æãã¾ãã
- ãã¼ã¿ã¦ã§ã¢ãã¦ã¹ï¼BigQuery
- ã¯ã¼ã¯ããã¼ï¼Airflow (Cloud Composer)
BigQueryã¯ã¨ãã¥ã¬ã¼ã¿ã¼ãåå¨ããªãã®ã§ããã¼ã«ã«ãªã©ã§ãã¹ããå®è¡ã§ãã¾ããã®ã§ãå®éã«ãBigQueryã«ãã¹ããã¼ã¿ãç¨æããã¯ã¨ãªãå®è¡ãããå¿ è¦ãããã¾ãã
ã¯ã¨ãªã®åä½ãã¹ãã§ãããã¨
BigQueryã«ã¯ã¨ãã¥ã¬ã¼ã¿ã¼ãåå¨ããªããããã¯ã¨ãªã®åä½ãã¹ããæ¸ããã¨æã£ãæã¯ã以ä¸ã®ãããªä½æ¥æé ã«ãªãã¨æãã¾ãã
- åä½ãã¹ãã®ããã«å¿ è¦ãªãã¹ããã¼ã¿ãç¨æãã
- ãã¹ããã¼ã¿ãä¸æçã«BigQueryä¸ã«ãã¼ãã«å
- ãã¹ããã¼ã¿ãå ¥ã£ããã¼ãã«ã«å¯¾ãã¦ãã¹ãã®ã¸ã§ããBigQueryä¸ã§å®è¡
- ãã¹ãã®ã¸ã§ãã®çµæããæ£è§£ã®ãã¼ã¿ã¨ä¸ç·ããå¤æãã
- å¦çãçµãã£ãããä½æãããã¼ãã«ãåé¤ãã
å®éã®å¦çã®å®è£
æ¹æ³ã«ã¤ãã¦ã¯ããã§ã¯ç´°ããè¨åãã¾ãããã大ã¾ããªããã¼ã¯ãã®ããã«ãªãããªã¨æãã¾ãã
ç«ã¡ã¯ã ãã3ã¤ã®å£
ä¸è¨ã®ãããªããã¼ãæ³å®ããæ©éãã¹ããæ¸ãã¦ãããã¨æã£ããã§ããã3ã¤ã®å£ã«ä¼ãã¾ããã
- ãã¼ã¿ãç¶²ç¾ çã«ç¨æããã®ãé常ã«ããã©ããã
- ã¯ã¨ãªããã¹ãããããããã«æ¸ããã¦ããªã
- ã¯ã¨ãªã®å¤æ´ã«ããç¨åº¦ãã¹ãã®å¦çãèãããããã«ãã
ããããã«ã¤ãã¦æ¸ãã¦ãããã¨æãã¾ãã
å£â ï¼ãã¼ã¿ãç¶²ç¾ çã«ç¨æããã®ãé常ã«ããã©ããã
ã¯ã¨ãªã¯ããåä½ã§ã¯åããªãã®ã§ããã¼ã¿ãç¨æããå¿ è¦ãããã¾ããè¾ãããªãã£ã¦æã£ã¦ãããã§ãããæ³å以ä¸ã§ããã
å¤ã®ç¯å²ãåºããã
å½ããåã§ãããBigQueryã«ãåãããã¾ãã以ä¸ã®ãããªæãã§ãã
- STRING
- TIMESTAMP
- INT64
- FLOAT64
æåã¯ããã®å¤ããã¼ã¹ã«ãã¦ããFaker.jsã¨ããå©ç¨ãã¦ããã¼ãã¼ã¿çæããã°ããããããã£ã¦æã£ã¦ãã¾ããã
ããããããã§åé¡ãçºçãã¾ããä¾ãã°ã次ã®ãããªã¯ã¨ãªã§ãã
SELECT
  SAFE_CAST(revenue AS FLOAT64) AS revenue -- revenueはSTRINGです
FROM hogehoge
ãªãã¨è¨ããã¨ã§ããã*5ããããã¡ãªãã¨ããããStringã§å ¥ãã¦ãå¾ã§ãªãã¨ãããããä½æ¦ã§ããã¾ãããã¯ä¸æ¦ã«ééã£ã¦ããªãã¦ãæ¬å½ã«STRINGãæ¥ããã¨ãããããã§ãã
ãã ããSTRINGã ãªãã¨æã£ã¦ããã¼ã®ãã¼ã¿ãçæãã¦ããã¡ã§ãINTãFLOAT(ãã¤ãã¹ãããã¼ãããã©ã¹ã¾ã§)ã®å¤ãSTRINGã®é¡ããã¦å ¥ãã¦ãããªãã¨è¨ããã¨ã§ããSTRINGãããã¨è¨ããã¨ã¯ã空ç½æååãæ³å®ããªãã¨ãããªãããã¨ããããã¾ãã
ãªã®ã§ãæ®éã«æååã ããæååã ãçæãã¦ããã¡ã ããéã«æ°å¤ããæ¥ãªãããã£ã¦ãããæ¬ã£ã¦ã¯ãããªãã¿ãããªæãã§ããè¾ãã
ãã¼ã¿ãç¨æããã©ã¤ãã©ãªããªã
ã¾ããä½ããã£ã¦è¨ããããããã¾ã§ãªãã§ãããæå¤ã«ããã©ããããæ¢ãã¦ã¿ããã©ãã俺ã欲ãããã®ã¯ãªãã£ããã¨è¨ãæãã§ããåä½ãã¹ãããããªãããå°ãè¸ãå¼µã£ã¦æ¸ããã°ãã£ã¦è¨ãæ°æã¡ã«ãªãã¾ããã
åã«æ²¿ã£ã¦ãã¼ã¿ãçæãããã®ã¯ãã£ãã®ã§ããã欲ããã£ãã®ã¯ã
- ç¨æããããã¼ã¿ã¯æ±ºã¾ã£ãã«ãã´ãªã¼ã®å¤ããå«ã¾ããªãSTRINGã ã£ããã
- ã¼ã以ä¸ã®æ´æ°ã ãã¨ãã
ããè¨ã風ã«ããæãã§BigQueryã®ã¹ãã¼ãã«ãã©ã¹Î±ã§æå®ãããããæãã§ãã¼ã¿ãçæãã¦ããããã®ã§ãã
ã«ã©ã æ°ãå°ãªãå ´åã¨ãã¯ã¾ãæåã§ããªãã¨ããªãããã§ããããããããªãã¨çµæ§ããã©ãããããªãã£ã¦ãªãã¾ãã
å£â¡ï¼ã¯ã¨ãªããã¹ãããããããã«æ¸ããã¦ããªã
ããã§ã¯å®è¡ã®é度ã¨ãã¯ä¸æ¦ç¡è¦ãã¦è°è«ãã¾ãããã¯ã¨ãªã®ãã¸ãã¯ããã¡ãã¨åããã¦ããã(withå¥ã§åããã¨ããããããdatalake/dwh/datamartã®ã¬ã¤ã¤ã¼ã§åããã¦ããªãã¨ã)ããã¹ãã®ããã«ç¨æãããã¼ã¿ãå¤ã«è¤éåããããè¦éããããããã¨ãããã¾ãã
å ç¨ã®SAFE_CASTã¨ããããã§ãããä¸çªæåã®ãã¼ã¿ãåå¾ããã¬ã¤ã¤ã¼ã¨ãã§åã®å¤æå¦çãã§ããéããã£ã¦ããã¨ãããã®å¤æãåãã¦ããã¨ãã ãã§ãããã©ã®ãããªãã¼ã¿ãæ¥ããã¨ãæå¾ ãã¦ããããããããããããªããå¾ã«å¯¾å¦æ¹æ³ã§ã話ãããã¼ã¿ã¨ã¯ã¨ãªã®ãã¹ããåãããã®ã楽ã«ãªãã¾ãã
å®éã®ã¯ã¨ãªã¯ãããªã£ã¦ããªããã¨ã®æ¹ãå¤ãã®ã§ãã¾ãã¯å¦çã®ããã¼ãè¦éãããããæ¹ããçæããæãã«ãªãã¾ãã
å£â¢ï¼ã¯ã¨ãªã®å¤æ´ã«ããç¨åº¦ãã¹ãã®å¦çãèãããããã«ãã
ããã¯ãå£â¡ãã®è©±ã«ãã¤ãªããã®ã§ããããã¹ãã®å¦çã®é½åä¸ãComposerãªã©ã§å®éã«å¦çã®ä¸ã§å©ããã¼ãã«ã¨ã¯éããã®ã«ãªãã¾ãã
ãªã®ã§ãããã°ã©ã ä¸ã§å¤æããã¦ãããå¿ è¦ããããããªã®ã§ãããã¾ããã¼ãã£ã·ã§ã³ãã¼ãã«ã ã£ãããããããªãã£ãããç°å¢ãã¨ã«å¤ãå¤ããããã«ãªã£ã¦ãããã¨ãããããã§ããããã辺ãã£ã¡ãèãããããã«ãæåããã¯ã¨ãªãæ¸ããã¦ããã°OKãªã®ã§ããããããããªãã£ããããã®ã§ç¾å®ã¯å¤§å¤ã§ãæ±ã
対å¦æ¹æ³
ã£ã¦ãªããã§ãã¯ã¨ãªã®åä½ãã¹ããæ¸ãã¨ãã®å£ãæ¸ãã¦ããã®ã§ãããããã«å¯¾ãã¦ã©ã®ãããªå¯¾å¦æ¹æ³ããã¦ããã®ãè¯ãããããèãã¦ã¿ã¾ããã
â ï¼ããããå ¥å£ãã¡ããã¨ãã
å ãåããªã話ã§ãããå¾è¿°ãã対å¦æ¹æ³ãèããã¨ããdatalakeãã®ã¬ã¤ã¤ã¼ã«ãããã¼ã¿ãããæãã§ããã°ããã»ã©æé«ãªããã§ãã
ãããªã±ã¼ã¹ã®æ¹ãå°ãªãã¨æãã®ã§ãä¸æ¦ããã¾ã§ã«ãã¦ããã¾ããããã¸ã§ãããå¹ãã¦ãããã ãªãã¨è¨ãæ°æã¡ã§ãã
â¡ï¼å¤ããã£ã«ã¿ã¼ããå¦çã¨ã¯ã¨ãªã®éè¨ãã¸ãã¯ãåãã
ç°å¸¸å¤ãå ¥ã£ã¦ããå¯è½æ§ããããã¨ãæ³å®ããã®ã¯è¯ããã¨ã§ãããããã¨éè¨ãã¸ãã¯ããã£ã¡ãã«ããã¨ããã¹ããããä¸ã§ãã¾ãåãåããã§ããªããªãã¾ãã
ãªã®ã§ããå¤ããã£ã«ã¿ã¼ããå¦çãã¨ãã¯ã¨ãªã®éè¨ãã¸ãã¯ããã§ããéããã¼ãã«ã§ãã£ãããåãã¯ã¨ãªå ã§ãã£ãã¨ãã¦ãããããããåããããããã¨ã§ããéè¨ãã¸ãã¯ã«å¯¾ãã¦ã®ãã¹ãã¯ããç¨åº¦æå¾ ããããã¼ã¿ã®ã¿ã¨ãããã¨è¨ãããã«ã§ãã¾ã*6ã
以ä¸ã§è¿°ã¹ãæå¹ç¯å²ã®ãã¹ãã¨ã®åãåããè¦éããè¯ããªãããã¹ãé²ãããããªãã¾ãã
â¢ï¼å¤ã®æå¹ç¯å²ã®ãã¹ãã¨ã¯ã¨ãªã®éè¨ã®ãã¹ããåãã
ãããå½ããåã ãã£ã¦æ°æã¡ããã§ãããå¤ã¨ãã¦ã¯å ¥ã£ã¦ããå¯è½æ§ãã¼ãã¨ã¯è¨ããªãã±ã¼ã¹(ã¢ããªã±ã¼ã·ã§ã³ã®ä»æ§ãå¤ãã£ãããã¹ã£ã¦ãã¼ã¿ãéã£ã¦ãã¾ã£ããç)ã¯ããã¨ãã¦ãããã®å¤ãæ¥ããã¨ã«æ°ã¥ããã¹ããã¨ãã¯ã¨ãªã®éè¨ãæ£ãããã®ãã¹ãããåãããã¨ã§ãç°¡åã«ãªãã±ã¼ã¹ãããã¾ãã
ä¾ãã°ã以ä¸ã®ããã«ãã¹ããåããã¨ãã§ãã
- å¤ã®æå¹ç¯å²ã®ãã¹ãï¼ã¦ãã¼ã¯å¶ç´ãEnumåã®ãã§ãã¯ãªã©
- ã¯ã¨ãªã®éè¨ã®ãã¹ãï¼ãã¸ãã¯é¨åã«ã®ã¿ãã©ã¼ã«ã¹ãå½ã¦ããã§ãã¯
ãã®åãåãããããã¨ã§ãã¯ã¨ãªã®éè¨ã®ãã¹ãã®é¨åã§ã¯ããã¾ãã¾ãªå¤ããããã¨ãæ³å®ãããã¹ããããªãã¦è¯ããªãããã¼ã¿ãç¨æããæéãçãã¾ãã
ã¯ã¨ãªã®éè¨ã®ãã¹ãã ãã§ã大å¤ã§ãããããç¨åº¦æ£å¸¸ç³»ã«è½ã¨ãè¾¼ããã¨ãã§ããã®ã§ãããã¯ç¾å®çã«ã¯æå¹ãªæ段ã¨æãã¾ãã
ã¾ã¨ã
ãã¹ããã©ã¯ãã£ã¹ã¯ã¾ã ããåãã£ã¦ã¾ããããããç¥ã£ã¦ããã£ãããæ¹ã¨ããããããã²æãã¦æ¬²ããã§ãï¼
*1:ã©ãªããããç¥æµããã ããç¬
*2:ä»æ´
*3:æ£ç´è
*4:ã¾ãå ¨é¨è½ã¨ãã¡ããããªããã§ããã©ãåªå 度ã£ã¦ãã¤ã§ããã
*5:ããã©ã¼ã¢ãã¿ã¼
*6:ãã¡ãããããã§ããã®ãã¨è¨ãã®ã¯ããã¤ã¤ãç¾å®åé¡ã³ã¹ããè¯ããªãã®ã¯äºå®ã ã¨æãã¾ã
ãAirflowãKubernetesPodOperatorã«dag_run.confã渡ããã
ããã«ã¡ã¯ã
ä»æ¥ã¯ãairflowã¨æ¯ãã¦ãããããªãããã°ãè¸ãã ãä½ããããã®ã§ãããã«ã¤ãã¦æ¸ãããã¨æãã¾ãã
ãããããã¨
KubernetesPodOperatorã«dag_run.confãenv_varsã®ãã©ã¡ã¼ã¿ã¼çµç±ã§æ¸¡ãã¦ããã®env_varsã«ãå®è¡æ¥ã渡ãã¦ãã©ã®æ¥ã®å®è¡ãè¡ãããããç°å¢å¤æ°çµç±ã§dockerã³ã³ããã®æ¹ã«ä¼ããããã¨ãä»åãããããã¨ã§ãã
ããã§ãå®è¡æ¥ãUIããTrigger Dagãããã¨ãã«æ¸¡ãããã«ã dag_run.conf
çµç±ã§æ¸¡ãããã¨ãã¾ããããã©ã«ãã§ã¯å®è¡ããæå»ã§ãã
ããã£ããã¨
airflow2.0ãä»Cloud Composerçµç±ã§ä½¿ã£ã¦ããã®ã§ãããairflowã®ãã¼ã¸ã§ã³1ã¨å¤ãã£ãããã§ããã¾ãtemplateãåãã¦ããã¾ããã§ãããå ·ä½çã«ã¯ãæå以ä¸ã®ãããªã³ã¼ããæ¸ãã¦ãã¾ããã
# Broken version: passing dag_run.conf via a templated env_vars dict.
kubernetes_pod_operator.KubernetesPodOperator(
    task_id='hogehoge_task_id',
    name='hogehoge_name',
    cmds=['run'],
    env_vars={
        'ENV': env,
        # Jinja template: use the "ymd" key from dag_run.conf when the DAG is
        # triggered manually, otherwise fall back to the execution date (ds).
        'EXECUTE_DATE': '{{ dag_run.conf["ymd"] if dag_run.conf["ymd"] else ds }}',
    },
    secrets=[secret_volume],
    namespace='default',
    startup_timeout_seconds=60 * 30,
    is_delete_operator_pod=True,
    image_pull_policy='Always',
    image='hogehoge',
)
templated(jinjaã®ãã³ãã¬ã¼ããèªã¿åã£ã¦å¦çãã¦ããã)ãªãã©ã¡ã¼ã¿ã¼ãªã¯ããªã®ã«ä½æ ãããããæååã¨ãã¦æ¸¡ããã¦ãã¦ãæ¥ä»ãã¼ã¿ãããªãããã¿ãããªã¨ã©ã¼ãåºã¦ãã¾ã£ã¦ãã¾ããããªãã§ã ããã¨æã£ã¦èª¿æ»ãã¦ãã®ã§ãããæå¤ã«ããã£ã¦ãã¾ã£ãã®ã§ãåå¿é²ãæ®ããã¨æã£ã次第ã§ãã
調æ»ãããã¨
ç´æ¥çãªåå ã¯ããããªãã£ãã®ã§ãããKubernetesPodOperatorã®ã¯ã©ã¹ãè¦ãã¨ãenv_varsãå¤æãã¦ããé¨åã®ã³ã¼ãã以ä¸ã®ããã«ããã
ãã®å¤æã®é¢æ°ãã以ä¸ã«ãããã¨ããããã¾ããã
å
·ä½çã«ã¯ã以ä¸ã®ã³ã¼ããªã®ã§ããããããè¦ãã¨ãdictã®æ㯠k8s.V1EnvVar
ã£ã¦ãã¤ã«ãã¦ãããã¨ããããã¾ãã
def convert_env_vars(env_vars) -> List[k8s.V1EnvVar]:
    """Convert a dictionary of environment variables into a list of
    ``k8s.V1EnvVar`` objects.

    A list input is assumed to already be in the target form and is
    returned unchanged; any other type raises ``AirflowException``.
    """
    if isinstance(env_vars, dict):
        # One V1EnvVar per key/value pair, preserving insertion order.
        return [k8s.V1EnvVar(name=key, value=val) for key, val in env_vars.items()]
    if isinstance(env_vars, list):
        return env_vars
    raise AirflowException(f"Expected dict or list, got {type(env_vars)}")
dictã§æ¸¡ãã¨ãjinjaã®ãã³ãã¬ã¼ãããã¾ãå¤æãããã«ããã®ã¾ã¾k8s.V1EnvVar
ã®æ¹ã«å¤æããã¦ãã¾ã£ã¦ããæ§åã§ããããªã®ã§ãlistå½¢å¼ã«æçµçã«ãã¦ãããå¿
è¦ãããã®ã§ãããããã§k8s.V1EnvVar
ã使ãã¨ããããæååã¨ã¿ãªãã¦ãã¾ããããã¡ã§ããã
ä»æ¹ãªãã®ã§ãV1EnvVarã®å®ç¾©ãè¦ã¦ãããã®å½¢ã«ãªãããã«èªåã§æ¸¡ãã¦ãããã¨ããã¾ãããã¾ããã
æçµçãªã³ã¼ã
ã¨ãããã¨ã§ã以ä¸ã®ãããªæãã§æ¸¡ãã¨ãã¾ãããã¾ããã
# Working version: env_vars passed as a list of V1EnvVar-shaped dicts so the
# Jinja template inside "value" is rendered instead of being taken literally.
kubernetes_pod_operator.KubernetesPodOperator(
    task_id='hogehoge_task_id',
    name='hogehoge_name',
    cmds=['run'],
    env_vars=[
        {'name': 'ENV', 'value': 'hogehoge'},
        {'name': 'EXECUTE_DATE', 'value': '{{ dag_run.conf["ymd"] if dag_run.conf["ymd"] else ds }}'},
    ],
    secrets=[secret_volume],
    namespace='default',
    startup_timeout_seconds=60 * 30,
    is_delete_operator_pod=True,
    image_pull_policy='Always',
    image='hogehoge',
)
ãã¼ãè¬ããã
ãPandasãé±ã®ææ¥ã®å§ã¾ããéãå ´åã®é±ãã¨ã®æ¥ä»ãåå¾ããæ¹æ³ããã£ãã®ã§ã¾ã¨ãã
ããã«ã¡ã¯ã
ã¢ããã¼ã·ã§ã³
ä»éçºã§ãé±ã®ææ¥ã®å§ã¾ããéãã±ã¼ã¹ããã£ã¦ãã§ãä¸å¹´éã®é±ã®éãçªå·ãã¨ã«éè¨ãããããã¿ãããªæãã®ãã¨ãããããé±ã®ææ¥å§ã¾ããéãææ¥ã®ã¹ã¿ã¼ãã§ãé±ãã¨ã®æ¥ä»ãåå¾ããããªã¨ããæ°æã¡ãããã¾ããã
pandasã§æ®éã«é±ãã¨ã®æ¥ä»ãåå¾ãããå ´åã¯ãdate_rangeã®ã¡ã½ããã使ã£ã¦ã
import pandas as pd

# Weekly timestamps over 2021; freq='W' defaults to weeks ending on Sunday.
pd.date_range(
    start='2021-01-01',
    end='2022-01-01',
    freq='W',
)
ã¨ããã°ãããªã£ã¦æã£ã¦ãã¾ããããããã®é±ã®ææ¥ã®ã¹ã¿ã¼ããå¤ãããå ´åã¯ãã©ãããããããã ããã¨ããã®ã解æ¶ããã®ããã®è¨äºã®ã¢ããã¼ã·ã§ã³ã§ãã
çµè«
freqã®ãã©ã¡ã¼ã¿ã¼ã«ãè¯ããã©ã¡ã¼ã¿ã¼ãããã¾ãã¦ãããã追å ããã ãã§ããä¾ãã°ãæææ¥ã¹ã¿ã¼ãã«ãããå ´åã¯ã以ä¸ã®ããã«ãªãã¾ãã
import pandas as pd

# Same range, but weeks anchored on Monday via the 'W-MON' frequency alias.
pd.date_range(
    start='2021-01-01',
    end='2022-01-01',
    freq='W-MON',
)
ç°¡åããããããã
ä»ã®ææ¥ã¯ã©ãããã®
ã¨ãããã¨ã§ãfreqã®ãã©ã¡ã¼ã¿ã¼ã®ãªã¹ããä½ãã¾ããã
開始曜日 | freqのパラメーター |
日曜日 | W-SUN |
月曜日 | W-MON |
火曜日 | W-TUE |
水曜日 | W-WED |
木曜日 | W-THU |
金曜日 | W-FRI |
土曜日 | W-SAT |
以ä¸ã®ãµã¤ãã«å
¬å¼ã®ããã¥ã¡ã³ããããã¾ãã
ããã§ã¯ã
ãPythonãPandasã§cross joinãããæ¹æ³
ããã«ã¡ã¯ã
ä»æ¥ã¯ãpandasã§cross joinãããæ¹æ³ã«ã¤ãã¦æ¸ãããã¨æãã¾ã*1ã
ãããããã¨
df_a, df_bã®äºã¤ã®ãã¼ã¿ãã¬ã¼ã ããã£ãã¨ãã¦ããã®äºã¤ã®ãã¼ã¿ãã¬ã¼ã ãcross joinããããªã¨ããæ°æã¡ã«ãªã£ãã¨ãã¾ãã
ãã ãPandasã«ã¯ããããã®ã¾ã¾ã§ããæ¹æ³ãããã¾ããã
ããæ¹
以ä¸ã®ãããªæãã§å®ç¾ã§ãã¾ãã
import pandas as pd


def cross_join(df_a, df_b, common_key=None):
    """Cross join (Cartesian product) of two DataFrames.

    Parameters
    ----------
    df_a, df_b : pd.DataFrame
        Input frames. They are NOT modified (the original version mutated
        both inputs in place by adding a 'tmp' column, and also leaked that
        column into the result).
    common_key : str or list, optional
        When given, fall back to a plain outer merge on that key instead of
        a full Cartesian product (behavior kept from the original version).

    Returns
    -------
    pd.DataFrame
        For the cross-join case: every row of ``df_a`` paired with every
        row of ``df_b``, with the helper key column removed.
    """
    if common_key is not None:
        return pd.merge(df_a, df_b, on=common_key, how='outer')
    # Join on a constant helper key so every left row matches every right
    # row. assign() returns copies, leaving the callers' frames untouched,
    # and the explicit on= prevents accidentally merging on other columns
    # the two frames happen to share.
    left = df_a.assign(_cross_join_key=1)
    right = df_b.assign(_cross_join_key=1)
    merged = pd.merge(left, right, on='_cross_join_key', how='outer')
    return merged.drop(columns='_cross_join_key')


# Example usage: cross_join(df_a, df_b)
åããã¼ãç¨æãã¦ããã¦ããããouterã§joinãããã¨ã§å®ç¾ã§ãã¾ãã
ã¾ã¨ã
ãã¼ããªããç¨æãã¦æ¬²ãããªãã£ã¦æãç¬ãããã§ã¯ã
*1:æ¯å調ã¹ã¡ããã®ã§
èªç¶è¨èªå¦çåãã®ãã¼ã¿ä½æãã¼ã«ã®ãdoccanoãã使ã£ã¦ã¿ãã®ã§ãã¾ã¨ãã
ããã«ã¡ã¯ã
æè¿ãä»äºã§èªç¶è¨èªå¦çé¢ä¿ã®ããã¸ã§ã¯ãããã£ã¦ããã®ã§ããããã®é¢ä¿ã§doccanoã¨ãããã¼ã«ã触ã£ã¦ã¿ããã¨ã«ãªã£ãã®ã§ã使ãæ¹ã¨ããã¾ã¨ãã¦ããã¾ãã
doccanoã¨ã¯
doccanoã¨ã¯ããªã¼ãã³ã½ã¼ã¹ã®ããã¹ãã¢ããã¼ã·ã§ã³ãã¼ã«ã§ãã
以ä¸ã®ä¸ã¤ã®ã¢ããã¼ã·ã§ã³ã¿ã¹ã¯ããããã¨ãã§ãã¾ãã
- Text Classification
- Sequence Labeling
- Sequence to Sequence
demoãµã¤ãã¯ä»¥ä¸ããã
RESTful APIãªã©ãæè¼ããã¦ããã®ã§ãçµæã®åå¾ãAPIçµç±ã§è¡ããªã©ããããã¨ãã§ãã¾ãã
doccanoãã¨ãããããã¼ã«ã«ã§ç«ã¡ä¸ãã¦ã¿ã
doccanoããã¼ã«ã«ã§ã¨ãããã試ãã¦ã¿ã¾ãã
pipã§ã¤ã³ã¹ãã¼ã«ãã¦ç«ã¡ä¸ãã
以ä¸ã®ã³ãã³ãã ãã§ç«ã¡ä¸ãããã¨ãã§ãã¾ãã
$ pip install doccano $ doccano
ããã§ã http://0.0.0.0:8000/
ã«ã¢ã¯ã»ã¹ããã¨ãã¼ã«ãè¦ãã¾ãã
ç°¡åã§ããã
dockerã§ç«ã¡ä¸ãã
dockerã§ç«ã¡ä¸ããã®ãããé£ãããªãã以ä¸ã®ã³ãã³ãã§å¯¾å¿ã§ãã¾ãã
$ docker pull doccano/doccano $ docker container create --name doccano \ -e "ADMIN_USERNAME=admin" \ -e "[email protected]" \ -e "ADMIN_PASSWORD=password" \ -p 8000:8000 doccano/doccano docker container start doccano
ããã§ã http://localhost:8000/
ã«ã¢ã¯ã»ã¹ããã¨ãã¼ã«ãè¦ãã¾ãã
Sequence Labelingã試ãã¦ã¿ã
ä»åã¯ãããã¹ãä¸ã®ãåã人ãã¨ãããã§ãªã人ããèå¥ããã¨ããæå³ä¸æãªã¿ã¹ã¯ãæ³å®ãã¦ããã®è¡¨ç¾é¨åãã¢ããã¼ã·ã§ã³ãã¦ãããã¨ããã£ã¦ã¿ããã¨æãã¾ãã
ãã¼ã«ã«ã§ç«ã¡ä¸ãã¦ãã°ã¤ã³ãã
å ã»ã©ã®æ¹æ³ã§ãã¼ã«ã«ã§ç«ã¡ä¸ãã¾ããããããå ã¯ãAWSãGCPãªã©ã®ç°å¢ã§ç«ã¡ä¸ãã¦ãåãã§ãã
ã¦ã¼ã¶ã¼åã¨ãã¹ã¯ã¼ããå ¥ãã¾ãã
ããã¸ã§ã¯ããä½æãã
å ¥ãã¨ãããã¸ã§ã¯ãã®ä¸è¦§ãè¦ããã¨ãã§ãã¾ããæåã¯ãããã¸ã§ã¯ãããªãã®ã§ãä¸è¦§ã«ã¯ä½ããªãã§ãããä»åã®ç§ã®ä¾ã§ãã¨ãä¸ã¤ããã¸ã§ã¯ããä½ããã¦ãã¾ãã
ä»åã®ã¢ããã¼ã·ã§ã³ã¿ã¹ã¯ã«é¢ããããã¸ã§ã¯ããç«ã¡ä¸ãã¾ããå·¦ä¸ã®Createãæ¼ãã¨ã以ä¸ã®ãããªã¢ã¼ãã«ãåºãã®ã§ãããã«å ¥åãã¾ãã
ããã¸ã§ã¯ããä½æãã¦ãããã¸ã§ã¯ãã®ãã¼ã¸ã«è¡ãã¨ã以ä¸ã®ãããªç»é¢ãè¦ãã¾ãã
å·¦å´ã«ãã¡ãã¥ã¼ãããã¾ããããããã以ä¸ã®å½¹å²ãããã¾ãã
- Homeï¼åã¹ãããã®ãããã¨ãYoutubeã®åç»ã§å¦ã¹ã
- Datasetï¼Annotationãããã¼ã¿ãImportãExportãããã¨ãã§ããã¾ãAnnotationããã¼ã¿ã«å¯¾ãã¦è¡ããã¨ãã§ãã
- Labelsï¼Annotationãããã©ãã«ãç»é²ãããã¨ãã§ãã
- Membersï¼Annotationãããã¡ã³ãã¼ã®ä¸è¦§ã確èªã§ãã
- Guidelineï¼Annotationã®ã¬ã¤ãã©ã¤ã³ãããã«è¨è¼ãããã¨ãã§ãã
- Statisticsï¼Annotationã®å®æ½ç¶æ³ã確èªãããã¨ãã§ãã
ãã¼ã¿ã»ããã®ã¤ã³ãã¼ã
æ©éããã¼ã¿ã»ããã®ã¤ã³ãã¼ãããã¦ã¿ã¾ãããµã¤ãã¡ãã¥ã¼ããã¤ã³ãã¼ãããã¾ãã
以ä¸ã®ãããªã¢ã¼ãã«ãåºã¦ããã®ã§ãã好ããªæ¹æ³ã§ãã¼ã¿ãã¤ã³ãã¼ããã¾ãã
Createã¨æ¼ãã¨ããã¼ã¿ã®åãè¾¼ã¿ãè¡ããã¾ãã
ã©ãã«ãä½æãã
ä»åã®ã¢ããã¼ã·ã§ã³ã§ã¤ããã©ãã«ãä½æãã¾ãã
Createãæ¼ãã¨ã以ä¸ã®ãããªã¢ã¼ãã«ã表示ãããå¿ è¦ãªæ å ±ãè¨å ¥ãã¾ãã
ä»åã¯ãåã人ã¨ãããããªã人ãå½ã¦ãã¿ã¹ã¯ã解ãããã®ã§ã以ä¸ã®ãããªã©ãã«ãä½æãã¾ããã
ããã§æºåå®äºã§ãããã¨ã¯ã²ãããã¢ããã¼ã·ã§ã³ãããã ãã§ãã
Annotatonãéå§ãã
å·¦ä¸ã®ãStart Annotationãããã§ãããã§ããã以ä¸ã®ç»åã«ããããã«Datasetã®ä¸è¦§ããAnnotationããããã®ããã¹ã¿ã¼ããã¦ãããã§ããããªãã§ã大ä¸å¤«ã§ãã
Annotationãã¹ã¿ã¼ããããã¨ã以ä¸ã®ããã«ããã¹ããåºã¦ãã¾ãããã¦ã¹ã§ããã¹ããæ´æ¿¯ããã¨ãå ã»ã©ä½æããã©ãã«ã表示ãããã©ã®ã©ãã«ã«ããããé¸æãããã¨ãã§ãã¾ãã
é¸æããã¨ã以ä¸ã®ããã«ã©ãã«ãããã¹ãã«å¯¾ãã¦ä»ä¸ããã¾ãã
ããã¦ã以ä¸ã®ããã«ã©ãã«ãä»ä¸ããå¾ã«ééããã¨æã£ãããããã¿ã³ãæ¼ããã¨ã§ãæ¶ããã¨ãã§ãã¾ãã
Annotationã®å®æ½ç¶æ³ã確èªãã
Annotationããã®ããã«è¡ã£ã¦ããã®ã§ãããå®æ½ããç¶æ³ã«ã¤ãã¦ã¯ã以ä¸ã®ããã«çµ±è¨æ å ±ã¨ãã¦å¾ããã¨ãã§ãã¾ãã便å©ã§ããã
Annotationããçµæãåãåã
Annotationãçµãã£ããããã¼ã¿ãExportãããã¨ãã§ãã¾ãã以ä¸ã®ç»é¢ãããExportãé¸æãã¦ã
ã¢ã¼ãã«ã«å¿ è¦ãªæ å ±ãå ¥åããã¨Exportã§ãã¾ãã
以ä¸ã§ä¸éãã®æ©è½ã«ã¤ãã¦ã¯ç´¹ä»ãã¾ããã
ææ³ã»ã¾ã¨ã
ä»åã®ããã°ã§ç´¹ä»ãã以å¤ã«ããã·ã§ã¼ãã«ãããã¼ã§æä½ãã§ããããä»ã«ãæ©è½ã¨ãã¦ã¯ããã¤ãããããã
ãã ã
- 1ããã¸ã§ã¯ãã«ã¤ã1MBã¾ã§ãããã¼ã¿ãImportã§ããªãã£ã½ãâ¦ï¼
- Sequence Labelingã§ã©ãã«ä»ä¸ããªãã£ããã®ãå®äºã£ã¦ããã®ã©ããããã ããã¨ããæ°æã¡
ã¨ãããããªæãã§ãã¡ããã¡ããã©ãããããããã ããã£ã¦ããã¨ããã¯ãã£ãã®ã§ã使ããªãã確èªãã¦è¡ããããªã¼ã¨æã£ã¦ãã¾ãã
ããã§ã¯ã
ãkedroãgcsã®ãã¡ã¤ã«ãèªã¿è¾¼ãã¨ãã®credentialsã®è¨å®æ¹æ³
ããã«ã¡ã¯ã
kedroã®ããã¥ã¡ã³ãã«ããæ¸ãã¦ãã£ãããã¿ã¾ãããªã®ã§ãããå°å³ã«ããã£ãã®ã§ãåå¿é²çã«æ¸ãã¦ããã¾ãã
ãããããã¨
gcsã«ãããã¼ã¿ãèªã¿è¾¼ãã§ããããå¦çã®ãªãã§ä½¿ãããã§ãã
è¨å®æ¹æ³
credentialsç³»ã®æ
å ±ã¯ã conf/local/credentials.yaml
ã«è¨å®ããã¨æãã®ã§ãããããã«ä»¥ä¸ã®ããã«æ¸ãã¾ãã
my_gcp_credentials:
  token: conf/local/my-service-account.json
ãã®tokenã¨è¨ããã¼ã¨dictã«ãã¡ã¤ã«ãæå®ãããã¨ããã®ãã¡ã¤ã«ãã¹ã®æå®æ¹æ³ã¯ããã¤ãéãprojecté ä¸ãã«ã¼ãã§ãããããã®ãã¹ã¨è¨ãæãã§ãã
ãªããªãåºã¦ããªãã¦kedroã®ã³ã¼ãèªãã§é°å²æ°ã§ãã£ã¦ã¿ããåºæ¥ãã®ã§ãæ¸ãã¦ããã¾ãã*1ã
*1:ã¿ããªGCSãããã¼ã¿ãèªã¿è¾¼ãããã£ã¦ãã¨ã¯æ¸ãã¦ããã®ã«ç¬
ã KedroãKedroã«å ¥éããã®ã§ã¾ã¨ãã
ããã«ã¡ã¯ã
æè¿ãKedroã¨è¨ãæ©æ¢°å¦ç¿åãã®ãã¤ãã©ã¤ã³æ§ç¯ç¨ã®ãã¼ã«ã使ã£ã¦ã¿ãã®ã§ãããã«ã¤ãã¦ã¾ã¨ãã¾ãã
Kedroã¨ã¯ï¼
æ¦è¦
Kedro 㯠QuantumBlack ã¨ãããã¼ã¿åæä¼æ¥ ãå ¬éãã¦ããããããã¯ã·ã§ã³ã¬ãã£ãªãã¼ã¿åæç¨ã¯ã¼ã¯ããã¼æ§ç¯ãã¼ã«ã§ããçµæ§ããããªãã¤ãã©ã¤ã³ãã¼ã«ãããã¾ãããå ¨é¨ã触ã£ããã¨ãããããã§ã¯ãªãã§ãããä»ã®ã¨ãã*1Kedroã¯ããããããè¯ãæãã§ãã
ãã¼ã«ã®ç´¹ä»åç»ã¯ä»¥ä¸*2ã
å ¬å¼å¨ãã®ãµã¤ãã®URLã¯ä»¥ä¸ã®éãã
ç¹å¾´
ãã¡ããã¡ãããããã®ç¹å¾´ãããã¾ããã以ä¸ãããã¯ããã便å©ã§ãã
- Airflowãªã©ã¨åãããPythonã§å ¨ã¦ã®ã¯ã¼ã¯ããã¼ãæ¸ããã¨ãã§ãã
- DAGå½¢å¼ã§ãã¤ãã©ã¤ã³ãå®ç¾©ã§ãã Sequentialãªå®è¡ã¨Parallelãªå®è¡ã®åãæ¿ããããã¤ãã©ã¤ã³ã®éä¸ããå®è¡ãããªã©ã§ãã
- yamlã§å®ç¾©ãããã¨ãã§ãããã¼ã¿ã«ã¿ãã°ã®æ©è½ããããcsv, pickle, feather, parquet, DBä¸ã®ãã¼ãã« ãªã©æ§ã ãªãã¼ã¿å½¢å¼ã«å¯¾å¿ãããã¨ãã§ãã
- ãã¼ã¿ã»ãããå¦ç¿ã¢ãã«ããã¼ã¸ã§ã³ç®¡çããæå®ã®ãã¼ã¸ã§ã³ã§ãã¤ã§ãå®è¡ã§ããããåç¾æ§ãæ ä¿ãã
- Cookiecutter ã«ãããã³ãã¬ã¼ããå©ç¨ãããã¨ã§ãè¤æ°äººã§ã®ä½æ¥ã管çã§ãã
- ã¢ãã«ã®ãã©ã¡ã¼ã¿ã¼ãyamlã§ç®¡çãããã¨ãã§ãã
- Jupyter Notebook, Jupyter Lab ã¨ã®ã¤ã³ãã°ã¬ã¼ã·ã§ã³
- æ¬çªç°å¢ã¸ã®ç§»è¡ããããããè¤æ°ã®ç°å¢(AWS, GCPã®ãããªããã¼ã¸ããµã¼ãã¹ãªã©)ã«ãããã¤ãããã¨ãã§ãããå ·ä½çã«ã¯Pythonã®ããã±ã¼ã¸ã¨ãã¦ã§ãã£ãããkubeflowãArgo WorkflowsãAWS Batchã«ãããã¤ãªã©ãã§ãã
- ãªãããªãã¤ãã©ã¤ã³ã®å¯è¦åãKedro vizã§ã§ãã
ã¨ãããããããããã¨ã¯ä¸éãã§ããããªé°å²æ°ãæãåã£ã¦ããã ããã¨æãã¾ãç¬
Kedroã®å¤§ã¾ããªæ§æè¦ç´
Kedroã¯ã大ã¾ãã«ã¯æ¬¡ã®4ã¤ããæ§æããã¦ãã¾ãã
- Node
- Pipeline
- DataCatalog
- Runner
Node
- å®è¡ãããå¦çã®åä½ã§ãåå¦çãå¦ç¿ã¨ãã£ãå¦çæ¬ä½ã«ãªããã®
- å ¥åãåºåã®ãã¼ã¿ã»ããã¨ããããå¦çãããã¸ãã¯ãå®ç¾©ãã¦ããã¤ãã©ã¤ã³ã«çµã¿è¾¼ã
Pipeline
- Nodeã®ä¾åé¢ä¿ãå®è¡é åºã管çãããã®ã
- decoratoræ©è½ãkedroã«ã¯ãããããã«ãã£ã¦ããã¤ãã©ã¤ã³å ¨ä½ã®å¦çã«å¯¾ãã¦æ©è½ãä»å ãããã¨ãã§ãã
DataCatalog
- ãã¤ãã©ã¤ã³ã§ä½¿ç¨ãããã¼ã¿ãå®ç¾©ããã«ã¿ãã°
- ãã¼ã¿ã»ããåãå½¢å¼ããã¡ã¤ã«ãã¹ããã¼ããã»ã¼ãæã®ãªãã·ã§ã³ãªã©ãæå®ãããã¨ãå¯è½
Runner
- ãã¤ãã©ã¤ã³ãå®è¡ãããã®ããã©ã¡ã¼ã¿ã¼ãæå®ãã¦å®è¡ãããã¨ãã§ããä¾ãã°ç¹å®ã®ãã¤ãã©ã¤ã³ã ãå®è¡ããã¨ããã§ããã
- SequentialRunnerãParallelRunnerã®äºã¤ã®Runnerãããã
Install
æ®éã«pipã¨ãcondaã§ã¤ã³ã¹ãã¼ã«ã§ãã¾ãã
# pipã¨ãcondaã§ã¤ã³ã¹ãã¼ã«ã§ãã¾ã $ pip install kedro $ conda install -c conda-forge kedro # installããããã®ç¢ºèªãkedroã£ã¦ããæåãåºã¦ãããæå $ kedro info _ _ | | _____ __| |_ __ ___ | |/ / _ \/ _` | '__/ _ \ | < __/ (_| | | | (_) | |_|\_\___|\__,_|_| \___/ v0.16.6 kedro allows teams to create analytics projects. It is developed as part of the Kedro initiative at QuantumBlack. No plugins installed
æ°è¦ã®ããã¸ã§ã¯ããä½æãã
kedroã®ããã©ã«ãã®ãã³ãã¬ã¼ãã使ãå ´åã¯ã以ä¸ã®ã³ãã³ãã§ããã¾ããããã¤ã質åãåºã¦ãã¾ãããããããèªãã§çããã°ãªãã±ã¼ã
$ kedro new
ãã³ãã¬ã¼ãããªã³ã«ããã¨ã以ä¸ã®ãããªæãã§ãã©ã«ããæ§æããã¾ãã
$ tree . . âââ README.md âââ conf â  âââ README.md â  âââ base â  â  âââ catalog.yml â  â  âââ credentials.yml â  â  âââ logging.yml â  â  âââ parameters.yml â  âââ local âââ data â  âââ 01_raw â  â  âââ iris.csv â  âââ 02_intermediate â  âââ 03_primary â  âââ 04_feature â  âââ 05_model_input â  âââ 06_models â  âââ 07_model_output â  âââ 08_reporting âââ docs â  âââ source â  âââ conf.py â  âââ index.rst âââ kedro_cli.py âââ logs â  âââ journals âââ notebooks âââ setup.cfg âââ src âââ requirements.txt âââ sample â  âââ __init__.py â  âââ hooks.py â  âââ pipelines â  â  âââ __init__.py â  â  âââ data_engineering â  â  â  âââ README.md â  â  â  âââ __init__.py â  â  â  âââ nodes.py â  â  â  âââ pipeline.py â  â  âââ data_science â  â  âââ README.md â  â  âââ __init__.py â  â  âââ nodes.py â  â  âââ pipeline.py â  âââ run.py âââ setup.py âââ tests âââ __init__.py âââ pipelines â  âââ __init__.py âââ test_run.py
ããã¸ã§ã¯ããgitã®ç®¡çä¸ã«ããã«ã¯ã以ä¸ã®ãããªæãã§ããã¾ãã
$ git init $ git add ./ $ git commit -m "init" $ git branch -M main $ git remote add origin <hogehoge> $ git push origin main
ãã©ã«ãæ§æãããããã¨æãã¾ãããã©ãã«ä½ãæ¸ãããæ確ã§ããµã³ãã«ã®ã³ã¼ãã追ããããã ãã§ã大ä½ä½ããã°ããããããã¾ãç¬*3
ã¨ããããåããã¦ã¿ã
kedro new
ãããã¨ãã«ãirisã®ãã¼ã¿ã»ããã§ãµã³ãã«ã®ã¢ãã«ãåããããã®ãã¤ãã©ã¤ã³ããã¼ãããã¼ã¿ã«ã¿ãã°ãç¨æããã¦ããã®ã§ãã¨ããããããã¤ããã¼ã«ã«ã§åããã¦ã¿ããã¨æãã¾ãã
$ cd <ããã¸ã§ã¯ãã«ã¼ã> # ã¾ãã¯ããã¸ã§ã¯ãã®ä¾åé¢ä¿ãã¤ã³ã¹ãã¼ã« $ kedro install # å®è¡ $ kedro run
ããã ãã§ããç°¡åã§ãããå®è¡ããã¨ãlogã®ãã©ã«ãã«ãã°ãåãåºãããã®ããããã¾ãã
次ããã¯ãå¦çãå®éã«è¿½å ãã¦ããã®ãã©ããããè¦ã¦ããã¾ãã
ãã¼ã¿ã½ã¼ã¹ã追å ãã(Data Catalogã追å ãã)
Nodeãªã©ã§ãã¼ã¿ãå®éã«ä½¿ãããã«ããã¼ã¿ã«ã¿ãã°ã«ãã¼ã¿ã½ã¼ã¹ã追å ãã¾ããconf/base/catalog.yml
ã¨ãããã¡ã¤ã«ã«è¨è¿°ãã¦ããã¾ãã
csvã¨ãã§ããã°ã以ä¸ã®ãããªæãã§è¿½å ã§ãã¾ããä»ã«ããxlsxãparquet, sqlTableãªã©æ§ã ãªãã¼ã¿ã½ã¼ã¹ã«å¯¾å¿ã§ãã¾ãã
companies:
  type: pandas.CSVDataSet
  filepath: data/01_raw/companies.csv

reviews:
  type: pandas.CSVDataSet
  filepath: data/01_raw/reviews.csv
data
ã®ãã£ã¬ã¯ããªã«ãããããã®ãã¼ã¿ã®ç¶æ
ã«åããã¦ãã©ã«ãã«ãã¼ã¿ãå
¥ãã¾ããããã辺ã人ã«ãã£ã¦ãã©ã«ãã®åãæ¹ãå¥ãããã¨ãã»ã¨ãã©ã§ãããäºåã«å®ç¾©ãããã¦ããã®ã§ãããããã§ãã
å¦çã追å ãã(Nodeãç·¨éãã)
nodeã®å¦çã¯åç´ã§ãæ®éã«é¢æ°ã追å ããã ãã§ã(å®)
試ãã«ããã³ãã¬ã¼ãã§åºã¦ããnodes.pyã®å¦çãè¦ã¦è¦ã¾ãã
from typing import Any, Dict
import pandas as pd


def split_data(data: pd.DataFrame, example_test_data_ratio: float) -> Dict[str, Any]:
    """Split the classical Iris data set into training and test sets,
    each further split into features and labels.

    The split ratio parameter comes from conf/project/parameters.yml; the
    data and parameters are injected automatically when the pipeline runs
    this node.
    """
    feature_names = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
    # NOTE: this renames the caller's frame columns in place (same as the
    # original template code).
    data.columns = feature_names + ["target"]
    classes = sorted(data["target"].unique())

    # One-hot encode the target; dummy columns are named after the classes.
    data = pd.get_dummies(data, columns=["target"], prefix="", prefix_sep="")
    # Shuffle the rows before splitting.
    data = data.sample(frac=1).reset_index(drop=True)

    # First n_test shuffled rows become the test set, the rest training.
    n_test = int(data.shape[0] * example_test_data_ratio)
    test_data = data.iloc[:n_test, :].reset_index(drop=True)
    training_data = data.iloc[n_test:, :].reset_index(drop=True)

    # Naming the returned pieces keeps the pipeline wiring readable.
    return {
        "train_x": training_data.loc[:, "sepal_length":"petal_width"],
        "train_y": training_data[classes],
        "test_x": test_data.loc[:, "sepal_length":"petal_width"],
        "test_y": test_data[classes],
    }
ãã®å¾åºã¦ãããã¤ãã©ã¤ã³ã¨ãã¼ã¿ã®å ¥åºåãåãããå¿ è¦ãããã¾ããããã以å¤ã¯æ®éã®é¢æ°ã®å¦çã§ãããã¨ããããã¨æãã¾ããæ¸ãå ´æãããç¨åº¦ç¸ãã ãã§ãããã辺ã«ç¹ã«ã«ã¼ã«ããªãã®ã¯å©ããã¾ããã
ãã¤ãã©ã¤ã³ãç·¨éãã
ãã¤ãã©ã¤ã³ã®å¦çã§ç·¨éããã®ã¯äºç®æã§ãpipeline.py
ã¨hooks.py
ã®äºã¤ã§ãã
pipeline.py
ã¾ãpipeline.py
ã§ãããèªåã§å®è£
ããnodeã®é¢æ°ããkedro.pipeline.node
ã使ã£ã¦ããã¤ãã©ã¤ã³ã«çµã¿è¾¼ã¿ã¾ããçµã¿è¾¼ãã¨ãã«ã¯ã第ä¸å¼æ°ã«é¢æ°ã第äºå¼æ°ã«å
¥åã§æ¸¡ããã¼ã¿ã第ä¸å¼æ°ã«åºåã®ãã¼ã¿ã渡ãã¾ãã
from kedro.pipeline import Pipeline, node

from .nodes import split_data


def create_pipeline(**kwargs):
    """Build the pipeline: a single node that splits the iris data set.

    The node reads the raw data set plus the split-ratio parameter and
    writes the four train/test feature/label data sets to the catalog.
    """
    outputs = {
        "train_x": "example_train_x",
        "train_y": "example_train_y",
        "test_x": "example_test_x",
        "test_y": "example_test_y",
    }
    split_node = node(
        split_data,
        ["example_iris_data", "params:example_test_data_ratio"],
        outputs,
    )
    return Pipeline([split_node])
ããã ãã§ãã
hooks.py
pipeline.py
ã§ä½ã£ããã¤ãã©ã¤ã³ãå®è¡ã§ããããã«ãã¾ããã¾ãããã¤ãã©ã¤ã³éã«ä¾åé¢ä¿ãããå ´åãã»ã¨ãã©ã ã¨æãã¾ãã®ã§ããã®å¦çãæ¸ãã¾ãã
from typing import Any, Dict, Iterable, Optional

from kedro.config import ConfigLoader
from kedro.framework.hooks import hook_impl
from kedro.io import DataCatalog
from kedro.pipeline import Pipeline
from kedro.versioning import Journal

from ab_recommender.pipelines import data_engineering as de
from ab_recommender.pipelines import data_science as ds


class ProjectHooks:
    """Kedro project hooks: pipeline registry, config loader and catalog."""

    @hook_impl
    def register_pipelines(self) -> Dict[str, Pipeline]:
        """Register the project's pipelines.

        Returns:
            A mapping from pipeline name to ``Pipeline`` object;
            "__default__" runs data engineering followed by data science.
        """
        de_pipeline = de.create_pipeline()
        ds_pipeline = ds.create_pipeline()
        return {
            "de": de_pipeline,
            "ds": ds_pipeline,
            "__default__": de_pipeline + ds_pipeline,
        }

    @hook_impl
    def register_config_loader(self, conf_paths: Iterable[str]) -> ConfigLoader:
        # Plain ConfigLoader over the given configuration paths.
        return ConfigLoader(conf_paths)

    @hook_impl
    def register_catalog(
        self,
        catalog: Optional[Dict[str, Dict[str, Any]]],
        credentials: Dict[str, Dict[str, Any]],
        load_versions: Dict[str, str],
        save_version: str,
        journal: Journal,
    ) -> DataCatalog:
        # Build the DataCatalog from the YAML config plus credentials
        # and versioning information.
        return DataCatalog.from_config(
            catalog, credentials, load_versions, save_version, journal
        )


project_hooks = ProjectHooks()
ããã ãã§ãå®è¡é åºã®ä¾åé¢ä¿ãè¨è¿°ãããã¨ãã§ãã¾ãã
ã¾ã¨ã
以ä¸ããã£ããã¨ããkedroã®ä½¿ãæ¹ã§ããã¾ã 試ãã¦ããªãã®ã§ãããkedroã®ãã¬ã¼ã ã«åã£ã¦æ¸ãã¦ããã°ãAWS Batchãkubeflowãªã©ã«ãããã¤ãããã¨ãã§ããããã«ãªããªã©ãä»ã«ãè¯ãããªæãã®ç©ãå¤ãã®ã§ã使ã£ã¦ã¿ããã¨æãã¾ãã
ããã§ã¯ï¼
åè
ãAWSãAWS Data Pipelineã®startDateTimeã®æå®ã§ç大ã«ããã£ã話
ããã«ã¡ã¯ã
ä»æ¥ã¯ãAWS Data Pipelineã使ã£ã¦ãã¦startDateTimeã®æå®ã§ç大ã«ããã£ãã®ã§ããã®è©±ãæ¸ãã¾ãã
ããããAWS Data Pipelineã£ã¦ä½ï¼
ã£ã¦æ¹ã¯ã以ä¸ã®è¨äºãã©ãããã¾ãããã®è¨äºèªãã§ãã人ã¯ããã«ã¤ãã¦ã¯ç¥ã£ã¦ããã ãããã©ã
ä»æ¥è©±ããã¨ä»¥å¤ã«ãããããã¤ã³ãã¯æ°ç¥ãããªã®ã§ããããä¾é¤ã
åæ
ç¾å¨devç°å¢ã¨prodç°å¢ã®äºã¤ãç¨æãã¦ãã¦ãããã§åããã¤ãã©ã¤ã³ã®å 容ãåããã¦ããã®ã§ãããdevã§ãã¹ãããã¦prodã«ãªãªã¼ã¹ããã¾ããã
devã§ãªãªã¼ã¹ããæ¥ã¨prodã§ãªãªã¼ã¹ããæ¥ã¯éãã¾ãããpipelineã®å®ç¾©ãªã©ã¯å ¨é¨ä¸ç·ã§ãã
ãã®ãªãªã¼ã¹ããæ¥ãéãã¨ããã®ããã¤ã³ãã§ãã
ããã£ããã¨
prodã«ãªãªã¼ã¹ããã¦ãæ°æ¥ããã£ãããæ¥ããã£ããªãããã¼ã¿èµ·å ã®ã¨ã©ã¼ãåºã¦ãããã¨ãããã¨ã§ãããç´ã対å¦ããã¾ããã
ã§ããããdevã§ãã¹ããã¦ãããprodã§ãªãªã¼ã¹ãããã¨ãããã¨ã§ãcliçµç±(æ£ç¢ºã«ã¯CIã§ãªãªã¼ã¹)ã§prodã«ãªãªã¼ã¹ãã¾ããã
ããããã次ã®ãããªã¨ã©ã¼ãåºãããã§ãã
{
  "validationErrors": [
    {
      "errors": [
        "startDateTime can not be changed"
      ],
      "id": "DefaultSchedule"
    }
  ],
  "errored": true
}
ããï¼startDateTimeãªãã俺å¤ãã¦ãªããã ãã©ã
ã£ã¦ãªã£ããã§ãããã¡ãªã¿ã«ã該å½ã¨ãªãjsonã¯ä»¥ä¸ã
{
  "period": "1 days",
  "startDateTime": "2020-10-20T21:30:00",
  "name": "Every 1 day",
  "id": "DefaultSchedule",
  "type": "Schedule"
}
ã§ãããã¼ï¼ãã¿ãããªæãã§æ©ãã§ãããã§ããã©ãä½åã³ã³ã½ã¼ã«ãã³ã¼ãã®diffãè¦ã¦ãåãæéãªãã§ãããåãã³ã¼ãã«ãªã£ã¦ããdevã§ãªãªã¼ã¹ãã¦ãã¨ã©ã¼ã¯åºãªãã
ã§ãä¸åæãã¤ããã®ããdevã§ãªãªã¼ã¹ãªãªã¼ã¹ããæ¥ã¨prodã§ãªãªã¼ã¹ããæ¥ãéããªãã£ã¦ããã®ããã£ããã§ããã
ã¾ãããªã£ã¦æã£ã¦ãprodã§ãªãªã¼ã¹ãã¦æåã«å®è¡ããæéãpipelineã«æå®ãã¦ãã£ã¦ã¿ãããä»åº¦ã¯ãã¾ãããã¾ããã
ã¨ãããã¨ã§ç¥è¦
以ä¸ãä¾é¤ããç¥è¦ã§ãã
- aws data pipelineã§ã¯ãã©ãããæåã«aws datapipelineã®ä¸èº«ããªãªã¼ã¹ããã¨ãã«ãéå»ã®æ¥ä»ãæåã®å®è¡éå§æ¥æã¨æå®ããå ´åããªããå é¨çã«ã¯æåã«ã¹ã±ã¸ã¥ã¼ã«ãå®è¡ããæ¥æã«ãªãããã
- ãªã®ã§ãaws data pipelineä¸ã§ããã¤ãã©ã¤ã³ãå®ç¾©ãã¦ããjsonãããyyyymmddã®hhæmmåã«å®è¡ãã¾ãããã£ã¦æ¸ããã¦ãã¦ããããã£ã¦AWS Data pipelineã®export pipeline definitionãã³ã³ã½ã¼ã«ä¸ã®å¤ãè¨ã£ã¦ãã¦ããæåã®å®è¡æå»isãã¤ãè¦ã¦ããããã¤ããªãã¨ãããªã
- å®éã«ããªãªã¼ã¹ä½æ¥ããã次ã®æ¥ã®å®è¡æå»ã試ãã«ãã¹ããã¦ã¿ãããã§ãã
ããã¯å®å ¨ã«ç½ ã ã£ãã®ã§ã誰ãã®å½¹ã«ç«ããªãããã ãã©ãæ¸ãã¦ããã¾ãã
ããã§ã¯ã
ãGCPãServerless frameworkã使ã£ã¦Cloud Functionãä½ã
ããã«ã¡ã¯ã
æè¿ãServerless Frameworkã使ã£ã¦Cloud functionãä½ãæ©ä¼ããã£ãã®ã§ããã¡ãã«ã¤ãã¦ã¾ã¨ãã¦ããã¾ãã
Serverless Frameworkã¨ã¯
Serverlessã¢ããªã±ã¼ã·ã§ã³ã§ããLambdaãCloud Functionãæ§æ管çããããããã¤ãããããã¼ã«ã«ã§è©¦ãããããããã®ãã¼ã«ã§ãã
以ä¸ãå ¬å¼ãµã¤ããå¤å°ã§ãããã¨ãææ°ã®ç©ã«å¯¾ãã¦è¶³ããªãã£ãããã¾ãããã¡ã¤ã³ã®ä½¿ãæã«å¯¾ãã¦ã¯ãåé¡ãªãæ©è½ãããã®ã§ãããã¥ã¡ã³ãèªãã§ãããããã¨ãã§ãããã確èªãã¦ããã»ããè¯ãããªã¨æãã¾ãã
å¦ç¿ã³ã¹ããå°ãªã対å¿ãããã¨ãã§ããã®ã§ãããããã§ãã
使ãåã®æºå
serverless frameworkã¯ãnode.jsã§æ¸ããã¦ãã¾ãããªã®ã§ãnodeãã¤ã³ã¹ãã¼ã«ãã¦ããã¾ãããã
å人çã«ãããããªã®ã¯ãnvmã§nodeã®ãã¼ã¸ã§ã³ã管çãã¦ãããã¨ã以ä¸ã¯ãCentosã«å ¥ããã¨ãã®ããã°ã§ãããmacã¨ãã§ãã»ã¨ãã©åããªã®ã§ãåèã«ãªãããªã¨ã
ã¤ã³ã¹ãã¼ã«
ããã§ã¤ã³ã¹ãã¼ã«ãããã¨ãã§ãã¾ãã
# servelessã®ã¤ã³ã¹ãã¼ã« $ npm install -g serverless # ã¤ã³ã¹ãã¼ã«ã®ç¢ºèª $ serverless --version
使ã£ã¦ã¿ã
ãããã¨
å®éã«ãããã¤ã¾ã§ã¯ãã¾ãããã以ä¸ã®ã³ãã³ããç´¹ä»ã
- GCPã®cloud functionããpythonã®runtimeã§å®è¡ããããã®templateãä½æãã
- deployããã
ãã£ã¦ã¿ã
templateã¨ãªããã¡ã¤ã«ãä½æããã«ã¯ã以ä¸ã®ã³ãã³ããæã¤ãã¨ã§å¯¾å¿ã§ãã¾ãã
$ serverless create --template google-python
ãã®ã³ãã³ããæã¤ã¨ã次ã®4ã¤ã®ãã¡ã¤ã«ãçæããã¾ãã
- .gitignore
- main.py
- package.json
- serverless.yml
ãã¨ã¯ãããã®main.pyã«èªåãæ¸ãããé¢æ°ãæ¸ããããserverless.ymlã«functionã®è¨å®ãæ¸ã足ãã¦ãããããªæãã§æ¸ããã¨ãã§ãã¾ãã
ä¾åé¢ä¿ãã¤ã³ã¹ãã¼ã«ããããã«ã以ä¸ã®ã³ãã³ããã¤ã³ã¹ãã¼ã«ãã¾ãã
$ npm install
å¤æ´ç®æãå¤æ´ããããdeployã¯ä»¥ä¸ã®éãã
$ serverless deploy
ããã ãã§åºæ¥ã¡ããã¾ããç°¡åã§ãããç´°ããè¨å®ãªã©ã¯ãããã¥ã¢ã«è¦ãªããããã°ã§ãã¡ããã®ã§ãããã§ã¯å²æãã¾ãã
注æç¹
ãããããã¨ãã®æ³¨æç¹ã§ãããäºåã«deployment managerã®APIãæå¹åãã¦ããå¿ è¦ãããã¾ããã¾ããService Accountãåå¾ããå¿ è¦ããããªã©ã®ç¹ãçæäºé ã§ãã
ãAWSãAWS Data Pipelineã§ãã©ã¤ãã¼ããµããããå ã«ããDB(RDS)ãæä½ããã®ã«ç大ã«ããã£ãã®ã§ã¾ã¨ãã
ããã«ã¡ã¯ã
ä»æ¥ã¯ãData Pipelineã§ãã©ã¤ãã¼ããµããããå ã«ããDBãæä½ããå ´åã®å¯¾å¦æ¹æ³ã«ã¤ãã¦ã¯ã¾ã£ãã®ã§ãæ¸ãã¦ã¿ããã¨æãã¾ãã
AWS Data Pipelineã¨ã¯
AWS Data Pipelineã«ã¤ãã¦ã¯ã以åè¨äºã«ãã¦ããã®ã§ã以ä¸ã®è¨äºãã覧ãã ãã*1ã
ã¯ã¾ã£ããã¨
ãã®è¨äºãæ¸ããã¨æã£ãåæ©ã«ããªãã®ã§ãããAWS Data Pipelineã§EC2ã¤ã³ã¹ã¿ã³ã¹ãå®è¡ç°å¢ã«æå®ãã¦ãActivityãåãããã¨ãã¦ããã®ã§ããããã®éã«ã¨ããVPCå ã®ãã©ã¤ãã¼ããµããããä¸ã«ããDBãæä½ããå¿ è¦ãããã¾ããããã®æã«ããã¼ã¿ãã¼ã¹ã«ã¢ã¯ã»ã¹ã§ããªãã©ãããããAWS Data pipelineã®APIãå©ç¨ã§ããªãã¨ããã¨ã©ã¼ãåºã¦ãã¾ãã¾ããã
ããããèãããå½ããåã£ã¡ãå½ããåã®ãã¨ãããªãã¨ãããªãã£ããã§ãããã¨ã©ã¼å 容ãªã©ãåºã¦ããããããï¼ãªãã§åããªãã®ï¼ãã£ã¦ãªã£ã¦ãæ°æéããã¼ãã£ã¦æ©ãã§ãã¾ã£ãã®ã§ããã«åå¿é²ã¨ãã¦æ®ãã¾ãã
ã¤ã¾ããããããã¨
ã¤ã¾ããããããã¨ãã¾ã¨ããã¨ã以ä¸ã®ãããªæãã«ãªãã¾ããé常ã®Webã¢ããªã±ã¼ã·ã§ã³ãªã©ã®å ´åããã«ãAZã®æ§æã«ããã¨æãã¾ãããããã§ã¯ã·ã³ãã«ã«ããããã«ä¸ã¤ã ãæ¸ãã¦ã¾ãã
å³ä¸ã®EC2ã¤ã³ã¹ã¿ã³ã¹ã¯ãData Pipelineã®å®è¡ç°å¢ã¨ãã¦ãData Pipelineãä½ã£ããã®ã¨ãã¦ã¿ã¦ãã ããããã®EC2ã¤ã³ã¹ã¿ã¹ããããã©ã¤ãã¼ããµããããã«ããDBã«ã¢ã¯ã»ã¹ã§ããããã«ãããã£ãæãã§ããã¡ãªã¿ã«ããã®DBã¯RDS(Aurora)ã§ãã
ãã®ã±ã¼ã¹ã«èµ·ãã¦èµ·ãã£ã¦ããåé¡
ãã®ã±ã¼ã¹ã§èµ·ãã£ã¦ããåé¡ã¯ããã¤ãããã¾ãã¦ã以ä¸ã®5ã¤ããã¾ããããªãã¨åãã¾ããæ±ã
- â ï¼ãã°ããã¼ã¿ãåãåºãS3ã«EC2ã¤ã³ã¹ã¿ã³ã¹ãã¢ã¯ã»ã¹ã§ãã
- â¡ï¼EC2ã¤ã³ã¹ã¿ã³ã¹ããå¤é¨ã®ãããã¯ã¼ã¯ã«ã¢ã¯ã»ã¹ã§ãã
- â¢ï¼EC2ã¤ã³ã¹ã¿ã³ã¹ããRDSã«ã¢ã¯ã»ã¹ã§ãã
- â£ï¼DriverãAuroraç¨ã«æå®ãã
- â¤ï¼å®è¡ã«å¿ è¦ãªãã¼ã«ãã¢ã¿ããããã¦ãã
ã¾ãæ®éã«å³ãæ¸ããªããããEC2ãããã«é ç½®ããã¦ããã£ã¦èããã°å½ããåã ã£ãã®ã§ãããData Pipelineããããªã«ããã®ããªã¨æã£ã¦ãã¾ã£ãé¨åããªããªãæãåºãããæéãé£ã£ã¦ãã¾ãã¾ããã
ä»åã®åé¡ã¸ã®è§£æ±ºç
ããããã®åé¡ã«å¯¾ãã解決çãæ¸ãã¦ããã¾ãã
â ï¼ãã°ããã¼ã¿ãåãåºãS3ã«EC2ã¤ã³ã¹ã¿ã³ã¹ãã¢ã¯ã»ã¹ã§ããããã«ãã
ä»åã¯ããã©ã¤ãã¼ããµããããããS3ã«ã¢ã¯ã»ã¹ã§ããå¿ è¦ãããã¾ããããã«ã¤ãã¦ã¯ã以ä¸ã®è¨äºã§ããããã«ãVPC endpointã®ãµã¼ãã¹ã使ãã¾ããã
ããã®è¨å®èªä½ã¯ããã¾ã§é£ãããªãã£ãã®ã§æé éãè¡ããã¨ã§ã§ãã¾ããã
â¡ï¼EC2ã¤ã³ã¹ã¿ã³ã¹ããå¤é¨ã®ãããã¯ã¼ã¯ã«ã¢ã¯ã»ã¹ã§ãã
ECSãåãããã¨ããæã«ãECRã«ã¢ã¯ã»ã¹ã§ããªãã¦Imageããã«ã§ããªãã¿ãããªã¨ã©ã¼ãåºãã®ã¯çµé¨ãããã¨ããããã§ãããããã¨ä¼¼ããããªãã¨ã§ããã
å ·ä½çã«ã¯ã以ä¸ã®ããã«EC2ã®ã¤ã³ã¹ã¿ã³ã¹ããdatapipelineã®APIã使ããã¨ãããã443ã®ã¿ã¤ã ã¢ã¦ãã¨ã©ã¼ãèµ·ããã¨ãããããªãã¨ã§ãã
private.com.amazonaws.http.AmazonHttpClient: Unable to execute HTTP request: Connect to datapipeline.ap-northeast-1.amazonaws.com:443 timed out
ãã®ããã«ãData Pipelineãåããæã«ãAWSã®Data Pipelineã®APIãEC2ã¤ã³ã¹ã¿ã³ã¹ãã使ããããã«ããå¿ è¦ãããã¿ããã§ããããã§ããããã«è¨å®ããå¿ è¦ãããã¾ããããã«ã¤ãã¦ã¯ãNAT Gatewayã使ã£ã¦ãã©ã¤ãã¼ããµããããã«ããEC2ã¤ã³ã¹ã¿ã³ã¹ããå¤ã®ã¤ã³ã¿ã¼ãããã«ã¢ã¯ã»ã¹ã§ããããã«ãã¾ããã
ããã«ã¤ãã¦ã¯ã以ä¸ã®è¨äºããã¼ã¹ã«ãã£ãããã¾ãããã¾ããã
EC2ã®ã»ãã¥ãªãã£ã°ã«ã¼ãã®Outboundãããã§å ¨ã¦ã®ãã©ãã£ãã¯ã許å¯ãã¦ãAPIã¨ãã使ãéã«å¤ã«åºãããããã«ãã¾ããã
â¢ï¼EC2ã¤ã³ã¹ã¿ã³ã¹ããRDSã«ã¢ã¯ã»ã¹ã§ãã
ããã«ã¤ãã¦ã¯ãåããã©ã¤ãã¼ããµããããã«ããEC2ããRDSã«ã¢ã¯ã»ã¹ã§ããããã«ããã°ããã ããªã®ã§ãRDSã®ã»ãã¥ãªãã£ã°ã«ã¼ãã®Inboundã®è¨å®ã«ãEC2ã®ã»ãã¥ãªãã£ã°ã«ã¼ãã許å¯ãã¦ãããã°è§£æ±ºãã¾ã*2ã
â£ï¼DriverãAuroraç¨ã«æå®ãã
ä½ãèããã«ãRDSã®ãã¼ãã«ã«ã¢ã¯ã»ã¹ãããã¨ããã¨ã以ä¸ã®ãããªã¨ã©ã¼ãèµ·ããã¾ãã
DriverClass not found for database:aurora
ãã®ã¨ã©ã¼ã®å¯¾å¦æ¹æ³ã«ã¤ãã¦ã¯ã以ä¸ã®è¨äºã«å©ãããã¾ããã
Auroraã®å ´åã¯ãã¡ãã£ã¨ããããã¤ã³ãããã£ã¦ã以ä¸ã®è¨äºã«å©ãããã¾ããã
â¤ï¼å®è¡ã«å¿ è¦ãªãã¼ã«ãã¢ã¿ããããã¦ãã
Data Pipelineã使ãéã«ã¯ãããã©ã«ãã§DataPipelineDefaultRoleã¨DataPipelineDefaultResourceRoleã®äºã¤ã®ãã¼ã«ã使ããã¾ããããããã®å½¹å²ã¯ã次ã®éãã§ãã
- DataPipelineDefaultRoleï¼AWS Data Pipeline ã« AWS ãªã½ã¼ã¹ã¸ã®ã¢ã¯ã»ã¹ã許å¯ãã
- DataPipelineDefaultResourceRoleï¼ã¢ããªã±ã¼ã·ã§ã³ã« AWS ãªã½ã¼ã¹ã¸ã®ã¢ã¯ã»ã¹ãä»ä¸ãã
ãã®ããããã®ãã¼ã«ã«ãä»å使ãã«ããã£ã¦å¿ è¦ãªãã¼ã«ãã¢ã¿ããããå¿ è¦ãããã¾ãã以ä¸ã®ããã¥ã¢ã«ã«æ¸ãã¦ããéãã«è¨å®ããã°åºæ¬çã«ã¯å¤§ä¸å¤«ã§ãã
ãã ããç§ã¯ãªããèªåã§ãã¼ã«ã¨ããªã·ã¼ãããã¥ã¢ã«ã§ä½æãã¦ããã®ãã¼ã«ã使ãããã«æå®ãããã以ä¸ã®ãããªWarningãåºã¾ããããªã®ã§ãããã©ã«ãã§ç¨æããã¦ãããã¼ã«ã使ãããã«ãã¾ããã
WARNING: 0 policies attached to the role - unable to validate policy for 'DataPipelineDefaultRole' as exactly one policy is expected when default policy is missing. WARNING: 0 policies attached to the role - unable to validate policy for 'DataPipelineDefaultResourceRole' as exactly one policy is expected when default policy is missing.
ãã®ä»ï¼å®è¡ã§ããããã«ãªã£ã¦ãWarningãåºç¶ãã
ã¶ã£ã¡ãããã¡ããã¨ãã¼ã«ãè¨å®ãã¦ãã以ä¸ã®ã¨ã©ã¼ãç§ã¯åºç¶ãã¾ãããåºæ¬çã«ã¯Warningãæ½°ãã¦ããData PipelineãActivateããå¿ è¦ãããã®ã§ããã以ä¸ã®Warningã¯ç¡è¦ãã¦ãå®è¡ã§ãã¾ããorzã
WARNING: Could not validate S3 Access for role. Please ensure role ('DataPipelineDefaultRole') has s3:Get*, s3:List*, s3:Put* and sts:AssumeRole permissions for DataPipeline.
è²ã ã°ã°ã£ã¦ããã¨ãä¼¼ããããªãã¨ã«å°ã£ã¦ããforumãstackoverflowã«åºä¼ãã¾ããã解決ããã¦ããªããã®ã°ããããã
ãã®ä»¶ã«ã¤ãã¦ã¯è«¦ãã¦ãã¾ããã誰ããã¬ãã¸ãããæ¹ãããã°æãã¦ãã ãããããã
ã¾ã¨ã
ããã¾ã§ã®ã¾ã¨ãã以ä¸ã§ãã
- AWS Data Pipelineã§ä½ãããEC2ã¤ã³ã¹ã¿ã³ã¹ã®æ±ãã¯ãæ®éã®EC2ã¤ã³ã¹ã¿ã³ã¹ã¨åããªã®ã§ãè¨å®ã®ç¢ºèªãªã©ã¯ãåããã©ã¤ãã¼ããµããããä¸ã«åãã»ãã¥ãªãã£ã°ã«ã¼ãã§EC2ã®ã¤ã³ã¹ã¿ã³ã¹ãç«ã¦ã¦ä¸éãå¦çãã§ãããã確èªããã誰(èªåãä½ãããAWS Data Pipelineã®ã©ã¡ãã)ãä½ããã®éãã§ãããªãã
èãã¦ã¿ãã°ã¯ã¼ã¯ããã¼ãã¸ã§ãã®å®è¡ç¶æ ã管çããã®ãData Pipelineã®ã¡ã¤ã³ã®ã¿ã¹ã¯ãªã®ã§ãå®è¡ç°å¢ã§ããEC2ã®è¨å®ãªã©ã¯ã»ãã¨åæ§ã«ããªãã¨ãããªãã£ãã¨ããæãã§ããã
以ä¸ã§ããããã§ã¯ã
ãAWSãAWS Data Pipelineå ¥é
ããã«ã¡ã¯ã
æè¿ä»äºã§AWS Data Pipelineã使ãæ©ä¼ããã£ãã®ã§ããã®æ©è½ã«ã¤ãã¦ã¾ã¨ãã¾ãã
AWS Data Pipelineã¨ã¯
AWS Data Pipelineã¨ã¯ãä¸è¨ã§è¨ãã¨AWSãæä¾ããAirflowãã¿ãããªæãã«ãªãã¨æãã¾ãã
AWSã®S3ãDynamoDBãRedshiftãªã©ã¨ãã£ã代表çãªãã¼ã¿ã½ã¼ã¹ã¨ãé£æºãã§ããcronã®ãããªã¸ã§ãã¹ã±ã¸ã¥ã¼ãªã³ã°æ©è½ããã¸ã§ãã®ä¾åé¢ä¿ãå®è¡ç°å¢ã®å®ç¾©ãã¸ã§ãã®å®è¡ã®ããã®ãªã½ã¼ã¹(EC2ã»EMR)ã®èµ·åã¨åæ¢ãã³ã³ã½ã¼ã«ä¸ããå¦çå®è¡ã®Statusã®ç¢ºèªã»ãªãã©ã¤ãã§ãã¾ã*1ã
ãã¡ãããã«ããã¼ã¸ããªã®ã§ãcronã®ç£è¦ã¿ãããªããã¸ã§ãã®ç£è¦ã®ç£è¦ãã¿ãããªãã¨ãããªãã¦ããã§ããç¬
å ¸åçãªã¦ã¼ã¹ã±ã¼ã¹ã«å¯¾ãã¦ã¯ãããã«è©¦ããããã«Templateãç¨æããã¦ãã¦ãä¾ãã°RDSããS3ã«ãã¼ã¿ãã³ãã¼ãããå ´åãªã©ã¯ãããã¤ãã®è¨å®ãããã ãã§ãå®ç¾ãããã¨ãã§ãã¾ãã
å©ç¨ã·ã¼ã³(Airflowã¨ãã¨ã®éã)
ããã¾ã§ã®è©±ã ãèãã¨ãããããAirflowã¨ä½ãéãã®ï¼Airflowãããã¡ãªã®ï¼ãã£ã¦ãã¨ã«ãªãã¨æãã®ã§ããã使ã£ã¦ããææã¨ãã¦ã¯ãã使ã£ã¦ããã¯ã©ã¦ããAWSã«éãã¦ãã¦ãä¸ã¤å°è¦æ¨¡ã«ãã¼ã¿ãã¤ãã©ã¤ã³ãä½ãããå ´åãã«ã¯ãAWS Data Pipelineã¯ããããªã¨æãã¾ããã¾ããPythonãªã©ããã¼ã ã®ã¡ã¤ã³ã®è¨èªã§ã¯ãªãå ´åããå®ç¾©ãã¡ã¤ã«ã¯Jsonã§ã³ã¼ãã¯ã»ã¼SQLã¨ã·ã§ã«ãã好ããªè¨èªã使ãã¨ããã§ããã®ã§ãAirflowãªã©ã«ä»£è¡¨ãããPythonã§DAGãå®ç¾©ãããªã©ããªããæå¹ã§ãã
å©ç¨ã·ã¼ã³ã¨ãã¦ã¯ã以ä¸ã®è¨äºã«ããããã«ããã¼ã¿ãRDSã«å ¥ã£ã¦ãã¦ãRDSããS3ã«Data Pipelineã§ãã¼ã¿ãå®å¸¸çã«æã£ã¦ãã¦ãããããAthenaã§åæããã¨ãã£ãå ´åã¯æå¹ããªã¨æãã¾ãã
ãã®ã»ãã«ããä¾ãã°ä»¥ä¸ã®ãããªã·ãã¥ã¨ã¼ã·ã§ã³ãèãããã¾ãã
- è¤æ°ã®ãã¼ã¿ã½ã¼ã¹ã«ãããã¼ã¿ããå´åããããã«ãã¾ã¨ãã¦ãå®å¸¸çã«åæãããå ´å
- AWS Data Pipelineã®Templateã§ç¨æããã¦ãããããªå ¸åçãªãã¼ã¿ãã¤ãã©ã¤ã³ã«å ãã¦ããã©ã¹Î±ã®å¦çå 容ã§æ¸ã¿ãããªå ´å
- è¤éãªãã¼ã¿ãã¤ãã©ã¤ã³ãä½ãã¾ã§ã®è¦æ¨¡ã§ã¯ãªããããããå°äººæ°ã§ãã¼ã¿ãã¤ãã©ã¤ã³ã®æ§ç¯ã»éç¨ãããå¿ è¦ãããå ´å
使ã£ã¦ããæè¦ã§ãããæ¡ä»¶ã«å¿ããåçãªãã¤ãã©ã¤ã³ã®å¶å¾¡ããã¿ã¹ã¯ã®ä¾åé¢ä¿ã大ãããªã£ãå ´åãªã©ã«ããã¦ã¯ãèªåã§Airflowãªã©ãç«ã¦ã¦éç¨ãããªã©ãè¯ãã¨æãã¾ãã大è¦æ¨¡ã«ãªã£ã¦ããã¨ãã¸ã§ãã®ä¾åé¢ä¿ãªã©ãææ¡ããã®ããã©ã¤ã¨æãã¾ã*2ã
ã¡ãã£ã¨çãããã®ãäºå®ã§ããã1人ã2人ã§ã¡ã¤ã³ã§éç¨ããªãã¨ãããªãã¦ããã¼ã¿ã½ã¼ã¹ãããã»ã©å¤ããªãããã¨ãããããã¤ãã©ã¤ã³ãä½ããã°ããã¿ãããªã·ãã¥ã¨ã¼ã·ã§ã³ã«ããã¦ã¯ãData Pipelineã¯æå¹ããªã¨æãã¾ãã
æé
æéã¯ä»¥ä¸ãããä¾ãã°æ±äº¬ãªã¼ã¸ã§ã³ã§ã1æ¥ã«ä¸åãããã®é »åº¦ã§ããã°ãç´0.57$ãªã®ã§ããã¡ããã¡ãå®ãã§ããããã«å ãã¦ãå®è¡æã«ä½¿ç¨ããEC2ãEMRã®ä»£éã¯å¥ã§ãããã¾ãã
å®éã«ãã£ã¦ã¿ã
0. ä»åä½ããã¤ãã©ã¤ã³
ä»åã¯ãRDSã«ãããã¼ã¿ãã¼ã¹ã®ãã¼ãã«ãS3ã«ã³ãã¼ãããã¤ãã©ã¤ã³ãå®éã«ä½ã£ã¦ã¿ããã¨æãã¾ãã
1. ãã¼ã¿ãã¤ãã©ã¤ã³ã®åæè¨å®ããã
ããã§ã¯ãå®éã«ãã¼ã¿ãã¤ãã©ã¤ã³ãä½ã£ã¦ã¿ã¾ããã¾ãã¯ãã³ã³ã½ã¼ã«ä¸ãããdata pipelineã®ãµã¼ãã¹ã®ã¨ããã«é£ã³ã¾ããåãã¦ä½ãå ´åã¯ã以ä¸ã®ãããªãã¼ã¸ã«ãªãã¨æãã¾ãã
ãGet started nowãããPipelineã®åæè¨å®ãã¼ã¸ã«é£ã³ã¾ããããã¨ã以ä¸ã®ãããªç»é¢ãåºã¦ãã¾ãã
ããã§ããSourceãã®ã¨ããããã以ä¸ã®ä¸ã¤ã®ãã¡ã®ã©ãããPipelineãä½æããããé¸ã³ã¾ãã
- Build using a templateï¼ãããããç¨æããã¦ããTemplateããä½æããå ´å
- Import a definitionï¼JSONã«ããå®ç¾©ãã¡ã¤ã«ããä½æããå ´å
- Build using Architectï¼AWSã®ã³ã³ã½ã¼ã«ããç·¨éããå ´å
ä»åã¯ããBuild using a templateãã«ä»åä½ããããã¤ãã©ã¤ã³ã®ãã®ãããã®ã§ãããã使ãã¾ã*3ãä»ã«ã¯ã以ä¸ã®ãããªTemplateãç¨æããã¦ãã¾ãã
ãã®templateã¨ä¼¼ã¦ãããã©ã¡ãã£ã¨éãã¨ãã§ããã°ãtemplateããä½ãã®ãæåã¯ããã¨æãã¾ã*4ãä½ã£ããã¤ãã©ã¤ã³ã®å®ç¾©ã¯ããArchitectãããExportãããã¨ãã§ããã®ã§ãã³ã¼ããã¼ã¹ã§å·®åã管çãããå ´åãªã©ã¯ããããå©ç¨ããã®ãããã¨æãã¾ããæ¢åã®ãã¤ãã©ã¤ã³ã«èªã¿è¾¼ã¿ãããããã¨ã¯ã§ããªãã¿ããã§ããããã
次ã«ãå®è¡ã®ã¹ã±ã¸ã¥ã¼ã«ããã°ãS3ã®ã©ã®ãã±ããã«ããããIAM Rolesã®è¨å®ããã¾ããå®è¡ç°å¢ã«ãã£ã¦ã¯ããã®è¨å®ãã¡ããã¨ããå¿ è¦ãããã®ã§ãããããã«ã¤ãã¦ã¯å°ãé·ããªãã®ã§å¥ã®è¨äºã§æ¸ãããã¨æãã¾ããããã¾ã§ãããããEdit in Architectããæ¼ãã¾ãã
ããã¨ã以ä¸ã®ãããªç»é¢ãåºã¦ãã¾ããå·¦å´ã«ãã¤ãã©ã¤ã³ã®åã³ã³ãã¼ãã³ãã¨ããã¼ãæãããå³å´ã«åã³ã³ãã¼ãã³ãã®è©³ç´°ãªè¨å®ãè¡ããã¨ãã§ããç»é¢ããããä¸ã«ãã¤ãã©ã¤ã³ã®è¨å®ã«é¢ããWarningãErrorãã§ãã³ã³ã½ã¼ã«ãããã¾ããåã³ã³ãã¼ãã³ãã®è©³ç´°ãè¨å®æ¹æ³ã«ã¤ãã¦ã¯ãä¸ã®ã使ãæ¹ã®åèãã®ã¨ããã«ããã¤ããªã³ã¯ãè²¼ã£ã¦ããã¾ãã®ã§ããã¡ãããåèãã ããã
ããããè¨å®ãã¦ããã¾ãã§ããã¨æã£ããããActivateããã¿ã³ãæ¼ãã¦ãå®éã«åããã¾ãããã¿ã³ãæ¼ãã¨ã以ä¸ã®ããã«ã©ã®Activityãä»ã©ã®ç¶æ ãããããç»é¢ã«é·ç§»ãã¾ããããã§ã¸ã§ãã®ç¶æ ããã£ã³ã»ã«ãåå®è¡ãªã©ããããã¨ãã§ãã¾ãã
使ãæ¹ã®åè
ã¨ããããããèªã
ã¡ãã£ã¨å¤ãã¦ãè¥å¹²UIãªã©ãéãã¾ãããã³ã³ã»ãããªã©ã¯å ¨é¨åãã§ãã
www.slideshare.net
Data Pipelineã®æ§æè¦ç´
ä¸ã®ãå®éã«ãã£ã¦ã¿ããã®ã¨ããã§ããã¤ãåºã¦ããæ§æè¦ç´ ã«ã¤ãã¦ã¯ã以ä¸ã®è¨äºãåèã«ãªãã¨æãã¾ãã
terraformã®å¯¾å¿ç¶æ³ã¨ã³ã¼ãã®ç®¡çæ¹æ³
terraformã使ã£ã¦ã¤ã³ãã©ã®ç®¡çããã¦ãã人ã¯ãã«ãªã対å¿ç¶æ³ã§ããã以ä¸ã®ãªã³ã¯å ã«ããããã«ãç¾æç¹ã§ã¯å¤å´ã®ç®±ã ãä½ããã¨ãã§ããããã§ãã
ãªã®ã§ãterraformã使ã£ã¦ç®±ã ãä½ã£ã¦ããã¨ã¯jsonã§pipelineã®ããã¼ã管çãã¤ã¤ãaws cliçµç±ã§pipelineã®ã¢ãããã¼ããããããã«ããããjsonã¨cliã ãã§ãããã®æããä»ã®ææãã¤ãã¦ãããã®ã§ãã
ã³ã¼ãããã¡ã¤ã«ããå®è¡ãããå ´åãªã©ã¯ãS3ã«ç½®ãã®ãdata pipelineã ã¨ã«ã¼ã«ãªã®ã§ãã³ã¼ãã¨ããjsonã¨ä¸ç·ã«ã»ããã§gitã¨ãã§ç®¡çãããããªã¤ã¡ã¼ã¸ã«ãªãã¨æãã¾ãã
ãã®ã»ãåè
åè
以ä¸ã«ãªãã¾ããããã§ã¯ã
ãGitãmacãæ°ããããã¨ãã«gitãã¤ã³ã¹ãã¼ã«ãã¦Githubã®è¨å®ãããæé
ããã«ã¡ã¯ã
ä¹ ãã¶ãã«æ°ããmacã§ä½æ¥ãå§ãã¦ããã®ã§ããããããæ©ã«åæã»ããã¢ããã®ã¨ãããã¾ã¨ãã¦ãããããªã¨æãã¾ãã
brewã®ã¤ã³ã¹ãã¼ã«
brewãã¾ãã¯å ¥ãã¾ããéä¸ã§Xcodeãã¤ã³ã¹ãã¼ã«ããã¢ãã¦ã³ã¹ãåºãã¨æãã¾ãããæ®éã«ãããã¤ã³ã¹ãã¼ã«ãã¡ããæãã§ã
$ ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
gitã®ã¤ã³ã¹ãã¼ã«
brewã§ã¤ã³ã¹ãã¼ã«ãã¾ãã
$ brew install git
確èª
以ä¸ã®ã³ãã³ãã§ç¢ºèªã§ãã¾ãã
$ git --version
Githubã®è¨å®
useråã¨emailãè¨å®ãã
$ git config --global user.name "<username>" $ git config --global user.email "<email>"
ssh-keyã®ä½æ
$ ssh-keygen -t rsa -b 4096 -C "<email>" $ chmod 600 ~/.ssh/id_rsa $ vim ~/.ssh/config ## 以ä¸ãã³ãã¼ãã¦è²¼ãä»ãã¦ä¿å Host github HostName github.com User git IdentityFile ~/.ssh/id_rsa
å ¬ééµãGithubã«ç»é²
æ®éã«GUIããããã
確èª
以ä¸ã®ã³ãã³ãã§ãæåããã£ã½ãã¡ãã»ã¼ã¸ãã§ãããªãã±ã¼ã
$ ssh github
ä¸ã®ä¸ã«ã¯ãã¡ããã¡ããã®æã®æ
å ±ã溢ãã¦ãããã©ãèªåç¨ã®ã¡ã¢ã以ä¸ã
ãGolangãAtCoderã®ç²¾é¸éå»å10åãã£ã¦ã¿ã
ããã«ã¡ã¯ã
ä»æ¥ã¯ãGoã®ç·´ç¿ãã¦ããAtCoderã®åé¡ã解ãã¦ã¿ã¾ããã以ä¸ã®è¨äºãèªãã§ãå ¥éè åãã®éå»åç²¾é¸10åããããã¨ãç¥ã£ãã®ã§ããããã¨ãã¾ããã
解ããã³ã¼ã
第0åï¼Welcome to AtCoder
// ABC086A "Welcome to AtCoder": read one integer, then a pair of
// integers, then a word; print the sum of the three integers followed
// by the word.
package main

import "fmt"

func main() {
	var first, second, third int
	var word string
	fmt.Scan(&first)
	fmt.Scan(&second, &third)
	fmt.Scan(&word)
	fmt.Printf("%d %s\n", first+second+third, word)
}
第1åï¼Product
// ABC086A "Product": print "Even" when a*b is even, "Odd" otherwise.
package main

import "fmt"

func main() {
	var x, y int
	fmt.Scan(&x, &y)
	if (x*y)%2 != 0 {
		fmt.Println("Odd")
	} else {
		fmt.Println("Even")
	}
}
第2åï¼Placing Marbles
// ABC081A "Placing Marbles": count the '1' characters in the input string.
package main

import "fmt"

func main() {
	var digits string
	fmt.Scan(&digits)
	count := 0
	for _, c := range digits {
		if c == '1' {
			count++
		}
	}
	fmt.Println(count)
}
第3åï¼Shift only
// ABC081B "Shift only": repeatedly halve all numbers while every one of
// them is even, and print how many halving rounds were possible.
package main

import (
	"bufio"
	"fmt"
	"os"
	"strconv"
	"strings"
)

// convertStringListToIntList parses each decimal string in data into an
// int. Parse errors are ignored (contest input is assumed well-formed),
// leaving 0 in that slot.
func convertStringListToIntList(data []string) []int {
	out := make([]int, len(data))
	for idx, token := range data {
		out[idx], _ = strconv.Atoi(token)
	}
	return out
}

// checkEven reports whether every element of data is even
// (no element has remainder 1 when divided by 2).
func checkEven(data []int) bool {
	for _, v := range data {
		if v%2 == 1 {
			return false
		}
	}
	return true
}

// divEven halves every element of data in place and returns the slice.
func divEven(data []int) []int {
	for i := range data {
		data[i] = data[i] / 2
	}
	return data
}

func main() {
	var n int
	fmt.Scan(&n)
	scanner := bufio.NewScanner(os.Stdin)
	scanner.Scan()
	values := convertStringListToIntList(strings.Split(scanner.Text(), " "))
	rounds := 0
	for checkEven(values) {
		rounds++
		values = divEven(values)
	}
	fmt.Println(rounds)
}
第4åï¼Coins
// ABC087B "Coins": count the combinations of 500-, 100- and 50-yen coins
// (bounded by the available counts) whose total is exactly x yen.
package main

import "fmt"

func main() {
	var c500, c100, c50, target int
	fmt.Scan(&c500)
	fmt.Scan(&c100)
	fmt.Scan(&c50)
	fmt.Scan(&target)
	ways := 0
	for i := 0; i <= c500; i++ {
		for j := 0; j <= c100; j++ {
			for k := 0; k <= c50; k++ {
				if 500*i+100*j+50*k == target {
					ways++
				}
			}
		}
	}
	fmt.Println(ways)
}
第5åï¼Some Sums
// ABC083B "Some Sums": sum every i in 1..n whose decimal digit sum lies
// in the inclusive range [a, b].
package main

import (
	"fmt"
	"strconv"
	"strings"
)

func main() {
	var n, lo, hi, answer int
	fmt.Scan(&n, &lo, &hi)
	for i := 1; i <= n; i++ {
		digitSum := 0
		for _, d := range strings.Split(strconv.Itoa(i), "") {
			v, _ := strconv.Atoi(d)
			digitSum += v
		}
		if digitSum >= lo && digitSum <= hi {
			answer += i
		}
	}
	fmt.Println(answer)
}
第6åï¼Card Game for Two
// ABC088B "Card Game for Two": both players greedily take the largest
// remaining card; print Alice's total minus Bob's total.
package main

import (
	"bufio"
	"fmt"
	"os"
	"sort"
	"strconv"
	"strings"
)

// convertStringListToIntList parses each decimal string in data into an
// int. Parse errors are ignored (contest input is assumed well-formed),
// leaving 0 in that slot.
func convertStringListToIntList(data []string) []int {
	out := make([]int, len(data))
	for idx, token := range data {
		out[idx], _ = strconv.Atoi(token)
	}
	return out
}

func main() {
	var n int
	fmt.Scan(&n)
	scanner := bufio.NewScanner(os.Stdin)
	scanner.Scan()
	cards := convertStringListToIntList(strings.Split(scanner.Text(), " "))
	// In descending order, Alice takes the even positions, Bob the odd.
	sort.Sort(sort.Reverse(sort.IntSlice(cards)))
	alice, bob := 0, 0
	for pos, card := range cards {
		if pos%2 == 0 {
			alice += card
		} else {
			bob += card
		}
	}
	fmt.Println(alice - bob)
}
第7åï¼Kagami Mochi
// ABC085B "Kagami Mochi": the tallest stack equals the number of distinct
// diameters, counted by sorting and looking for value changes.
package main

import (
	"fmt"
	"sort"
)

func main() {
	var n int
	fmt.Scan(&n)
	diameters := make([]int, n)
	for i := 0; i < n; i++ {
		fmt.Scanf("%d", &diameters[i])
	}
	sort.Sort(sort.IntSlice(diameters))
	distinct := 0
	for i := 0; i < n; i++ {
		// The first element and every element differing from its
		// predecessor contribute one new diameter.
		if i == 0 || diameters[i-1] != diameters[i] {
			distinct++
		}
	}
	fmt.Println(distinct)
}
第8åï¼Otoshidama
// ABC085C "Otoshidama": find bill counts making exactly total yen.
package main

import "fmt"

// existCandidate searches for x 10000-yen, y 5000-yen and z 1000-yen
// bills with x+y+z == n totalling exactly total yen. It returns the
// first combination found (scanning x, then y, ascending), or
// (-1, -1, -1) when no combination exists.
func existCandidate(n, total int) (int, int, int) {
	for x := 0; x <= n; x++ {
		for y := 0; y <= n; y++ {
			z := n - (x + y)
			if 0 <= z && total == x*10000+y*5000+z*1000 {
				return x, y, z
			}
		}
	}
	return -1, -1, -1
}

func main() {
	var n, total int
	fmt.Scan(&n, &total)
	x, y, z := existCandidate(n, total)
	fmt.Println(x, y, z)
}
第9åï¼白昼夢
// ABC049C "白昼夢 / Daydream": decide whether s is a concatenation of
// "dream", "dreamer", "erase", "eraser".
package main

import (
	"fmt"
	"strings"
)

func main() {
	var s string
	fmt.Scan(&s)
	// Compress the base words to single-letter markers, then delete the
	// patterns the four words can produce; an empty remainder means the
	// whole string was a valid concatenation.
	rules := [][2]string{
		{"dream", "D"},
		{"erase", "E"},
		{"Der", ""},
		{"Er", ""},
		{"D", ""},
		{"E", ""},
	}
	for _, rule := range rules {
		s = strings.Replace(s, rule[0], rule[1], -1)
	}
	s = strings.TrimSpace(s)
	if s == "" {
		fmt.Println("YES")
	} else {
		fmt.Println("NO")
	}
}
第10åï¼Traveling
// ABC086C "Traveling": each time unit moves exactly one grid step, so a
// visit plan is feasible iff, between consecutive points, the elapsed
// time covers the Manhattan distance and shares its parity.
package main

import (
	"fmt"
	"math"
)

func main() {
	var n int
	var t, x, y float64
	prevT, prevX, prevY := float64(0), float64(0), float64(0)
	fmt.Scan(&n)
	feasible := true
	for i := 0; i < n; i++ {
		fmt.Scanf("%f %f %f", &t, &x, &y)
		dt := math.Abs(t - prevT)
		manhattan := math.Abs(x-prevX) + math.Abs(y-prevY)
		// Parity check mirrors the distance check: an odd time budget
		// needs an odd Manhattan distance and vice versa.
		if int(dt)%2 != 0 {
			if (dt < manhattan) || (int(manhattan)%2 == 0) {
				feasible = false
			}
		} else {
			if (dt < manhattan) || (int(manhattan)%2 == 1) {
				feasible = false
			}
		}
		prevT = t
		prevX = x
		prevY = y
	}
	if feasible {
		fmt.Println("Yes")
	} else {
		fmt.Println("No")
	}
}
以ä¸ã§ããä»å¾ãä»ã®åé¡ã解ãã¦ãããããããã§ã¯ã
Google Domainsã使ã£ã¦ãã¡ã¤ã³ãåå¾ããã®ã§ã¾ã¨ãã
ããã«ã¡ã¯ã
æè¿ããã©ã¤ãã¼ãã§ã¢ããªéçºããã¦ã¿ããã¨æã£ã¦ããã¡ã¤ã³ãåãã¦Google Domainsã使ã£ã¦åå¾ããã®ã§ããã®ããæ¹ãã¾ã¨ãã¾ãã
ä»ãFirebaseã§ã¢ããªã±ã¼ã·ã§ã³ã¯ãã¹ãã£ã³ã°ãã¦ãDNSã¯ä»å¾ã®ãã¨ãèãã¦GCPã§ãã£ã¦ããã®ã§ãã*1ãããã¨Google Domainsãã©ããã£ã¦é£æºãã¦ããããå¥ã®è¨äºã§æ¸ãããã¨æãã¾ãã
Google Domainsã¨ã¯
æ¬å®¶ã®ãµã¤ãã¯ã以ä¸ãGoogle Domainsã¨ã¯ãGoogleãæä¾ãããã¡ã¤ã³ç»é²ãµã¼ãã¹ã®ãã¨ã§ããé¡ä¼¼ãµã¼ãã¹ã¨ãã¦ã¯ãããåå.comããªã©ãæåã ã¨æãã¾ãããç§ã使ã£ã¦ãã¾ãã
ããåå.comãã¨åæ§ã«ãå©ç¨å¯è½ãªãã¡ã¤ã³ã®æ¤ç´¢ã»è³¼å ¥ã»ç®¡çãªã©ãè¡ããã¨ãã§ãã¾ãã
Google Domainsã®ã¡ãªããã¨ãã¦ã¯ãG Suiteã§ã¡ã¼ã«ã¢ãã¬ã¹ãä½ãããã¨ã«ãªãã¨æãã¾ã*2ã
GCPãªã©ã使ã£ã¦ããå ´åã«ã¯ãå ¨é¨Googleã®ãµã¼ãã¹ã§ç®¡çãããã¨ãã§ãã¦ä¾¿å©ã¨ããã®ãããã¾ãã
ãã¡ã¤ã³ã®åå¾æ¹æ³
Google Domainsã使ãã«ã¯Googleã¢ã«ã¦ã³ããå¿ è¦ãªã®ã§ãããGoogleã¢ã«ã¦ã³ããæã£ã¦ããªãæ¹ã¯ãæ°è¦ã§çºè¡ãã¦ãã ãããæ°è¦ã§ãã¡ã¤ã³ãåå¾ããã«ã¯ãGoogleã«ãã°ã¤ã³ãã¦ãGoogle Domainsã®ãµã¤ãã«è¡ãããã¡ã¤ã³ã®æ¤ç´¢ç»é¢ã«ãã¾ãã
ãã¡ã¤ã³ã®æ¤ç´¢çªã§ãèªåãåå¾ããããã¡ã¤ã³åãæ¤ç´¢ãã¾ãã
ããã¨ãå©ç¨å¯è½ãªãã¡ã¤ã³ãä¸è¦§ã§åºã¦ãã¾ãã®ã§ãã好ã¿ã®ãã¡ã¤ã³ãã¯ãªãã¯ãã¦ãã«ã¼ããã¿ã³ãæ¼ããã«ã¼ãã«å ¥ãã¾ãã
ããã¦ãã«ã¼ãã«å ¥ãããã¡ã¤ã³ã®è³¼å ¥æç¶ãã¸ã¨é²ã¿ã¾ããããã§ã追å ã§æéãæããã¨ã«ãã£ã¦ããã®ãã¡ã¤ã³ã使ã£ã¦G Suiteã®ã«ã¹ã¿ã ã¡ã¼ã«ãå©ç¨ãããã¨ãã§ãã¾ãã
以ä¸ã§ããã¡ã¤ã³ã®åå¾ãè¡ããã¨ãã§ãã¾ããç°¡åã§ãã
ãã®ä»ã«ããã«ã¹ã¿ã ãã¼ã ãµã¼ãã¼ãå©ç¨ãããã¨ãå¯è½ã§ãããããã«ã¤ãã¦ã¯å¥ã®è¨äºã§ç´¹ä»ãããã¨æãã¾ãã
åè
Data Portal(æ§Data Studio)ãçé¢ç®ã«è§¦ã£ã¦ã¿ãã®ã§ãåèã«ãªã£ãè¨äºã¨ãã¾ã¨ãã
ããã«ã¡ã¯ã
ããã¾ã§ãèªç¤¾ã§Redashãã¡ã¤ã³ã§ä½¿ã£ã¦ããã®ã§ãããGCPã®ç§»è¡ãé¨åçã«é²ãã¦ããã®ããã£ã¦ãData Portal(Data Studio)ã社å 使ã£ã¦ã¿ã¾ããã®ã§ããã®ã¨ãã«èª¿ã¹ãå 容ã¨ãããã£ããã¨ã¨ããä¸ãã¦ããããã¨æãã¾ãã
ã¾ã ã使ã£ã¦4æ¥ã¨ããã®ç¨åº¦ãªã®ã§ãééã£ã¦ããç®æã大ãã«ããã¨æãã¾ããããã®éã¯ã³ã¡ã³ãããã ãã¾ãã¨å¹¸ãã§ãã
æ¬å®¶ã®ããã¥ã¡ã³ã
Google Data Portalã¨ã¯
Google Data Portal*1ã¯ãç¡æã§å©ç¨ã§ããBIãã¼ã«ã§ããGoogleã¢ã«ã¦ã³ããããã°ãå©ç¨ãããã¨ãã§ãã¾ãã
Google DocumentãGoogle SpreadsheetãGoogle Slideãªã©ã¨åæ§ã«ãå ±åç·¨éæ©è½ãå ±ææ©è½ãæãã¦ãããã¾ãGoogle AnalyticsãBigQueryãªã©ãéãã¦ãæ§ã ãªãã¼ã¿ãåãè¾¼ã¿ãå¯è¦åã§ãã¾ãã
ã¾ããRedashãªã©ã¨éã£ã¦ãSQLãªã©ãåºæ¬çã«ã¯æ¸ãå¿ è¦ããªããããã°ã©ãã³ã°ä¸è¦ã§ä½¿ããã¨ãã§ããã®ã§ãã¨ã³ã¸ãã¢ä»¥å¤ã®ã¡ã³ãã¼ã®æ¹ã«ããã¼ãã«ãä½ããå°å ¥ãã¹ã ã¼ãºã«è¡ããã¨ãã§ããã¨æãã¾ãã
ã¨ãããã使ã£ã¦ã¿ã
ã¨ãããã使ã£ã¦ã¿ãããã«ãããã¤ãã®ããã¥ã¡ã³ããèªã¿ã¾ãããã以ä¸ã®é£è¼è¨äºãä¸çªããããããã£ãã§ãã
Data Studioã®ä½¿ãæ¹ãä¸éããããã¾ãããã®ä»ã«ãã以ä¸ãããã®è¨äºãåèã«ãªãã¾ããã
使ã£ã¦ã¿ã¦ã®ææ³
ã¯ããã¾ããããªã«å¤ãã®BIãã¼ã«ã使ã£ããã¨ããªãã¯ã¤ã®ææ³ã§ãããData Studioã¯ä¸é·æçãªéç¨ãç´°ããåæãè¡ãã«ã¯ãå°ãä¸ä¾¿ã ãªãã¨æãã¾ããã
(å¤å)使ã対象ã¨ãªãã¦ã¼ã¶ã¼*2
使ã対象ã«ãªãã¦ã¼ã¶ã¼ã¯ãä»ã®BIãã¼ã«ãããå°ãçãããâ¦ï¼ã¨æãã¾ããã大å ããã©ãã¨å½ããåã¨ããã°å½ããåãªã®ã§ãããGoogle analyticsãGoogle Adsenseã¨ãã£ããµã¼ãã¹ãæ´»ç¨ãã¦ããããã¼ã±ãã£ã³ã°ãåºåã®éç¨æ å½è ãªã©ã使ãã¨ãããã®ããªâ¦ï¼ã¨æãã¾ããã
éã«ãã¬ãªã¬ãªSQLãæ¸ãã¦è¡ããããªãã¼ã¿æ½åºãä¼´ããããªãã®ãªã©ã¯é£ããããªã¨æãã¾ãããä»åãä»äºã§ä½¿ã£ãã®ã¯ä»æ±ã£ã¦ãããµã¼ãã¹ã§ãAIã使ã£ã¦ãã¼ã¿ã解æããæ°å¤ã®å¾åã®ç£è¦ã®ããã ã£ãã®ã§ãç´°ããè¨å®ãå°ãè¤éãªéè¨ãè¡ãå¿ è¦ããã*3ãå°ãã¦ã¼ã¹ã±ã¼ã¹ã«ãã£ã¦ããªãã£ãã®ããªãã¨æãã¾ããã
使ã£ã¦ã¿ããã調ã¹ã¦ã¿ããããä¸ã§ã®ã¡ãªããã¨ãã¡ãªãããã¾ã¨ãã¦ã¿ã¾ãã
ã¡ãªãã
Googleã®è©ã®ä¸ã«ãã¾ãä¹ããã¨ãã§ããã¦ã¼ã¹ã±ã¼ã¹ã§ã¯ãã¡ãªããããã¾ã享åã§ãããªãã¨æãã¾ãããããã以å¤ã®ãã®ããç°¡åãªéè¨ã§ãã大人æ°ã§éç¨ããå ´åããä¸é·æçã«éç¨ãèãã¦ãããããªã±ã¼ã¹ã§ã¯ãä»ã®BIãã¼ã«ãæ¤è¨ããã»ããããããªã¨æãã¾ãã
- BI Engine*4ã®æ©æµãåãããã¨ãã§ãã
- æ§ã ãªãã¼ã¿ã½ã¼ã¹ã®è¿½å ãç°¡åã«ã§ããã¾ãGUIã§æä½ãã§ããã®ã§ãSQLãæ¸ããªã人ã使ããã¨ãã§ãã
- ãµã³ãã«ã§ç¨æããã¦ãããã³ãã¬ã¼ããè±å¯ã§ãRedashã§ã¯ãããä¸å¯è½ãªã¬ãã«ã®ã¬ãã¼ãã®ãã¶ã¤ã³ã®ã«ã¹ã¿ãã¤ãºãã§ãã*5
- Google analyticsãªã©ã®ãã¼ã¿ã®å ´åããã³ãã¬ã¼ããå å®ãã¦ããããã®ãã³ãã¬ã¼ãã使ã£ã¦åæããããã¨ãã§ãã
ãã¡ãªãã
ãã¯ãã½ãRedashãªãâ¦Redashãªãã§ããã®ã«â¦ï¼ãã£ã¦æã£ããã¨ãã¡ã¤ã³ã«ãªã£ã¦ãã¾ãã¾ããç¬
- ãã¼ã¿ã½ã¼ã¹ã¨ã¬ãã¼ãã§è¡ãå¯è¦åã®æ©è½ã®å¢çç·ãææ§ãªããããã¼ã¿ã½ã¼ã¹ã誤ã£ã¦ç·¨éãããããã¼ã¿ã®æå³ãªã©ãã¬ãã¼ãå é¨ã§å¤ãã£ã¦ãã¾ããã¾ãå½±é¿ç¯å²ãèªããªããªã
- ãã¼ã¿ã½ã¼ã¹ãã¡ããã¨ç®¡çãã¦ããªãã¨ããã¼ã¿ã½ã¼ã¹ã®ç®¡çãç ©éã«ãªã£ã¦ãã
- èªç±åº¦ãé«ããã¦ããã¼ã¿ãå¯è¦åããã¨ãã以å¤ã®ã¨ããã«å´åã使ã£ã¦ãã¾ã*6
- ã¬ãã¼ãã®ã¨ããã§ä»»æã«æ°ãããã£ã¼ã«ãã®å®ç¾©ãè¡ããã¨ãã§ããã®ã§ããã¼ã¿ã½ã¼ã¹
- BI EngineããViewãã«ã¹ã¿ã ã¯ã¨ãªã§ã¯ä½¿ããªããªãå ´åããã
使ãæ¹ã®é¢ã§åèã«ãªã£ãè¨äº
BI Engine
Googleãå»å¹´çºè¡¨ãããBI Engineã¨ãããµã¼ãã¹ã§ããBigQueryã®ãã¼ã¿ãä¸å®ããªã¥ã¼ã ã ããã£ãã·ã¥ãã¦ãããData Portalä¸ã§å©ç¨ããã¨ãã«é«éã«ãã¼ã¿ã«ã¢ã¯ã»ã¹ã§ããããã«ãããã®ã§ãã
詳ããã¯ä»¥ä¸ã®åç»ã¨ã
以ä¸ã®è¨äºã«æ¸ãã¦ããã¾ãã
BigQueryã§ã®ã«ã¹ã¿ã ã¯ã¨ãªã¼
BigQueryã¨ã®é£æºãä»åã¯ã¡ã¤ã³ã§ãã£ãã®ã§ãåå¿é²çã«æ¸ãã¦ããã¾ããèªåã§ã«ã¹ã¿ã ã§ã¯ã¨ãªã¼ãä½ã£ã¦ããã¼ã¿ãåå¾ãããã¨ãã§ãã¾ããDateãã«ã¹ã¿ã ã®ãã©ã¡ã¼ã¿ã¼ãã¯ã¨ãªã«æå®ãããã¨ãã§ãã¾ãã
URLã§ãã©ã¡ã¼ã¿ã¼ãæå®ãã
BIãã¼ã«ã¨ãã§ãDashboardã§ä½¿ç¨ããå ±éãã©ã¡ã¼ã¿ã¼ãURLã®ä¸ã§æå®ãããã¨ãã§ããããã¾ããèªåãè¦ããã©ã¡ã¼ã¿ã¼ãåºå®ã§ããå ´åããããããã¯ãã¼ã¯ãã¦ããã¨ä¾¿å©ã§ãããã ãData Portalã¯ããã©ã«ãã§ã¯URLãã¼ã¹ã§ãã©ã¡ã¼ã¿ã¼ãæå®ãããã¨ãã§ããªãã®ã§ã以ä¸ã®è¨äºã®ãããªæãã§è¨å®ããå¿ è¦ãããã¾ãã
ä»äºã§ã®æ´»ç¨
ããã¾ãçµç¹ãæ¥åã§Data Portalãéç¨ãã¦ããã®ãè¦ã¤ãããã¨ãã§ããªãã£ãã®ã§ããã以ä¸ã®è¨äºã¯åèã«ãªãã¾ããããã誰ãç¥ã£ã¦ãã人ããããæãã¦æ¬²ããã§ãã
ä»æ¥ã¯Data Portalã使ã£ã¦ã¿ãã¨ãã«åèã«ãªã£ããµã¤ãã¾ã¨ãã¨ãã¡ãªãããã¡ãªããã«ã¤ãã¦æ¸ãã¦ã¿ã¾ããã
ããã§ã¯ã
*1:æã¯ãGoogle Data Studioã¨ãªã£ã¦ãã¾ããããä»ã¯Google Data Portalã¨å¼ã°ãã¦ãã¾ã
*2:å¤åã¨è¨ã£ã¦ããã®ã¯ãã¾ã 使ãå§ãã¦ã¾ããªãããç¬
*3:æéåä½ã®è¨å®(5åééã¨ã)ãè¡ãå¿ è¦ããã
*4:ãã¨ã§åºã¦ãã¾ã
*5:ãã®ç¹ã¯ã絶対ã«Redashã§ã¯åã¦ãªãã¨ããã§ã¯ãããããã®åé¢èªç±åº¦ãé«ããã¦ããããç¨åº¦ãã®ã¬ãã¼ããä½ãã®ãå°ãè¦å´ãã¾ãã
*6:ããã¯å人ã®ææ³ã«è¿ãã§ããããããªãããã©ã³ãã®ãµã¤ãºã¨ããããã®ã§ããã®ã«ã¨ããæãç¬