Clean up useless files in each image layer

Kaggle · Nov 30, 2018 · eb3f09f · eb3f09f
1 parent 604efce
commit eb3f09f
Show file tree

Hide file tree

Showing 3 changed files with 65 additions and 43 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -1,29 +1,33 @@
 FROM gcr.io/kaggle-images/python-tensorflow-whl:1.11.0-py36 as tensorflow_whl
 FROM continuumio/anaconda3:5.2.0
 
-# This is necessary for apt to access HTTPS sources
-RUN apt-get update && \
-    apt-get install apt-transport-https
-
+ADD clean-layer.sh  /tmp/clean-layer.sh
 ADD patches/ /tmp/patches/
 ADD patches/nbconvert-extensions.tpl /opt/kaggle/nbconvert-extensions.tpl
 
+# This is necessary for apt to access HTTPS sources
+RUN apt-get update && \
+    apt-get install apt-transport-https && \
+    /tmp/clean-layer.sh
+
     # Use a fixed apt-get repo to stop intermittent failures due to flaky httpredir connections,
     # as described by Lionel Chan at http://stackoverflow.com/a/37426929/5881346
 RUN sed -i "s/httpredir.debian.org/debian.uchicago.edu/" /etc/apt/sources.list && \
     apt-get update && apt-get install -y build-essential unzip && \
     # https://stackoverflow.com/a/46498173
     conda update -y conda && conda update -y python && \
     pip install --upgrade pip && \
-    apt-get -y install cmake
+    apt-get -y install cmake && \
+    /tmp/clean-layer.sh
 
 # Tensorflow doesn't support python 3.7 yet. See https://github.com/tensorflow/tensorflow/issues/20517
 # Fix to install tf 1.10:: Downgrade python 3.7->3.6.6 and downgrade Pandas 0.23.3->0.23.2
 RUN conda install -y python=3.6.6 && \
     pip install pandas==0.23.2 && \
     # Another fix for TF 1.10 https://github.com/tensorflow/tensorflow/issues/21518
     pip install keras_applications==1.0.4 --no-deps && \
-    pip install keras_preprocessing==1.0.2 --no-deps
+    pip install keras_preprocessing==1.0.2 --no-deps && \
+    /tmp/clean-layer.sh
 
 # The anaconda base image includes outdated versions of these packages. Update them to include the latest version.
 RUN pip install --upgrade seaborn python-dateutil dask && \
@@ -43,13 +47,13 @@ RUN pip install --upgrade seaborn python-dateutil dask && \
     wget https://imagemagick.org/download/ImageMagick.tar.gz && \
     tar xzf ImageMagick.tar.gz && cd `ls -d ImageMagick-*` && pwd && ls -al && ./configure && \
     make -j $(nproc) && make install && \
-    # clean up ImageMagick source files
-    cd ../ && rm -rf ImageMagick*
+    /tmp/clean-layer.sh
 
 # Install tensorflow from a pre-built wheel
 COPY --from=tensorflow_whl /tmp/tensorflow_cpu/*.whl /tmp/tensorflow_cpu/
 RUN pip install /tmp/tensorflow_cpu/tensorflow*.whl && \
-    rm -rf /tmp/tensorflow_cpu
+    rm -rf /tmp/tensorflow_cpu && \
+    /tmp/clean-layer.sh
 
 RUN apt-get install -y libfreetype6-dev && \
     apt-get install -y libglib2.0-0 libxext6 libsm6 libxrender1 libfontconfig1 --fix-missing && \
@@ -112,10 +116,7 @@ RUN apt-get install -y libfreetype6-dev && \
     vader_lexicon verbnet webtext word2vec_sample wordnet wordnet_ic words ycoe && \
     # Stop-words
     pip install stop-words && \
-    # clean up
-    rm -rf /root/.cache/pip/* && \
-    apt-get autoremove -y && apt-get clean && \
-    rm -rf /usr/local/src/*
+    /tmp/clean-layer.sh
 
 # Make sure the dynamic linker finds the right libstdc++
 ENV LD_LIBRARY_PATH=/opt/conda/lib
@@ -128,10 +129,9 @@ RUN apt-get -y install zlib1g-dev liblcms2-dev libwebp-dev libgeos-dev && \
     cd basemap && \
     git checkout v1.1.0 && \
     python setup.py install && \
-    pip install basemap --no-binary basemap
-
-# sasl is apparently an ibis dependency
-RUN apt-get -y install libsasl2-dev && \
+    pip install basemap --no-binary basemap && \
+    # sasl is apparently an ibis dependency
+    apt-get -y install libsasl2-dev && \
     # ...as is psycopg2
     apt-get install -y libpq-dev && \
     pip install ibis-framework && \
@@ -162,7 +162,8 @@ RUN apt-get -y install libsasl2-dev && \
     # Re-run it to flush any more disk writes
     python -c "from keras.models import Sequential; from keras import backend; print(backend._BACKEND)" && \
     # Keras reverts to /tmp from ~ when it detects a read-only file system
-    mkdir -p /tmp/.keras && cp /root/.keras/keras.json /tmp/.keras
+    mkdir -p /tmp/.keras && cp /root/.keras/keras.json /tmp/.keras && \
+    /tmp/clean-layer.sh
 
     # scikit-learn dependencies
 RUN pip install scipy && \
@@ -204,21 +205,15 @@ RUN pip install scipy && \
     apt-get install -y sox libsox-dev libsox-fmt-all && \
     pip install cffi && \
     cd /usr/local/src && git clone https://github.com/pytorch/audio && cd audio && python setup.py install && \
-    # ~~~~ CLEAN UP ~~~~
-    rm -rf /root/.cache/pip/* && \
-    apt-get autoremove -y && apt-get clean && \
-    conda clean -i -l -t -y && \
-    rm -rf /usr/local/src/*
+    /tmp/clean-layer.sh
 
 # vtk with dependencies
 RUN apt-get install -y libgl1-mesa-glx && \
     pip install vtk && \
     # xvfbwrapper with dependencies
     apt-get install -y xvfb && \
     pip install xvfbwrapper && \
-    # ~~~~ CLEAN UP ~~~~
-    rm -rf /root/.cache/pip/* && \
-    apt-get autoremove -y && apt-get clean
+    /tmp/clean-layer.sh
 
 RUN pip install --upgrade mpld3 && \
     pip install mplleaflet && \
@@ -268,7 +263,8 @@ RUN pip install --upgrade mpld3 && \
     pip install pystan && \
     pip install ImageHash && \
     conda install -y ecos && \
-    conda install -y CVXcanon
+    conda install -y CVXcanon && \
+    /tmp/clean-layer.sh
 
 RUN pip install fancyimpute && \
     pip install git+https://github.com/pymc-devs/pymc3 && \
@@ -324,7 +320,8 @@ RUN pip install fancyimpute && \
     pip install geoplot && \
     pip install eli5 && \
     pip install implicit && \
-    pip install dask-ml[xgboost]
+    pip install dask-ml[xgboost] && \
+    /tmp/clean-layer.sh
 
 RUN pip install kmeans-smote --no-dependencies && \
     # Add google PAIR-code Facets
@@ -343,7 +340,8 @@ RUN pip install kmeans-smote --no-dependencies && \
     pip install cufflinks && \
     pip install glmnet_py && \
     pip install lime && \
-    pip install memory_profiler
+    pip install memory_profiler && \
+    /tmp/clean-layer.sh
 
 # install cython & cysignals before pyfasttext
 RUN pip install --upgrade cython && \
@@ -374,9 +372,8 @@ RUN pip install --upgrade cython && \
     pip install mlcrate && \
     # Required to display Altair charts in Jupyter notebook
     pip install vega3 && \
-    jupyter nbextension install --sys-prefix --py vega3  && \
-    # clean up pip cache
-    rm -rf /root/.cache/pip/*
+    jupyter nbextension install --sys-prefix --py vega3 && \
+    /tmp/clean-layer.sh
 
 # Fast.ai and dependencies
 RUN pip install bcolz && \
@@ -438,9 +435,7 @@ RUN pip install bcolz && \
     # which downgrades pytorch. fastai does work with pytorch 0.4.
     pip install fastai==0.7.0 --no-deps && \
     pip install torchtext && \
-    # clean up pip cache
-    rm -rf /root/.cache/pip/* && \
-    cd && rm -rf /usr/local/src/*
+    /tmp/clean-layer.sh
 
     ###########
     #
@@ -476,16 +471,15 @@ RUN pip install flashtext && \
     pip install ggplot && \
     pip install cesium && \
     pip install rgf_python && \
-    ##### ^^^^ Add new contributions above here ^^^^ #####
-    # clean up pip cache
-    rm -rf /root/.cache/pip/*
+    /tmp/clean-layer.sh
 
 # Pin Vowpal Wabbit v8.6.0 because 8.6.1 does not build or install successfully
 RUN cd /usr/local/src && \
     git clone -b 8.6.0 https://github.com/JohnLangford/vowpal_wabbit.git && \
     ./vowpal_wabbit/python/conda_install.sh && \
     # Reinstall in non-editable mode (without the -e flag)
-    pip install vowpal_wabbit/python
+    pip install vowpal_wabbit/python && \
+    /tmp/clean-layer.sh
 
 # For Facets
 ENV PYTHONPATH=$PYTHONPATH:/opt/facets/facets_overview/python/
@@ -501,7 +495,8 @@ RUN pip install --upgrade dask && \
     # Stop Matplotlib printing junk to the console on first load
     sed -i "s/^.*Matplotlib is building the font cache using fc-list.*$/# Warning removed by Kaggle/g" /opt/conda/lib/python3.6/site-packages/matplotlib/font_manager.py && \
     # Make matplotlib output in Jupyter notebooks display correctly
-    mkdir -p /etc/ipython/ && echo "c = get_config(); c.IPKernelApp.matplotlib = 'inline'" > /etc/ipython/ipython_config.py
+    mkdir -p /etc/ipython/ && echo "c = get_config(); c.IPKernelApp.matplotlib = 'inline'" > /etc/ipython/ipython_config.py && \
+    /tmp/clean-layer.sh
 
 # Add BigQuery client proxy settings
 ENV PYTHONUSERBASE "/root/.local"

diff --git a/clean-layer.sh b/clean-layer.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+#
+# This script should be called at the end of each RUN command
+# in the Dockerfiles.
+#
+# Each RUN command creates a new layer that is stored separately.
+# At the end of each command, we should ensure we clean up downloaded
+# archives and source files used to produce binaries, to reduce the size
+# of the layer.
+set -e
+set -x
+
+# Delete files that pip caches when installing a package.
+rm -rf /root/.cache/pip/*
+# Remove packages that were installed as dependencies and are no longer needed
+apt-get autoremove -y
+# Delete downloaded archive files
+apt-get clean
+# Delete source files used for building binaries
+rm -rf /usr/local/src/*
+# Delete conda downloaded tarballs
+conda clean -y --tarballs
diff --git a/gpu.Dockerfile b/gpu.Dockerfile
@@ -2,6 +2,8 @@ FROM nvidia/cuda:9.1-cudnn7-devel-ubuntu16.04 AS nvidia
 FROM gcr.io/kaggle-images/python-tensorflow-whl:1.11.0-py36 as tensorflow_whl
 FROM gcr.io/kaggle-images/python:staging
 
+ADD clean-layer.sh  /tmp/clean-layer.sh
+
 # Cuda support
 COPY --from=nvidia /etc/apt/sources.list.d/cuda.list /etc/apt/sources.list.d/
 COPY --from=nvidia /etc/apt/sources.list.d/nvidia-ml.list /etc/apt/sources.list.d/
@@ -36,7 +38,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
       libnccl2=2.2.12-1+cuda9.1 \
       libnccl-dev=2.2.12-1+cuda9.1 && \
     ln -s /usr/local/cuda-9.1 /usr/local/cuda && \
-    ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
+    ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
+    /tmp/clean-layer.sh
 
 # Reinstall packages with a separate version for GPU support
 # Tensorflow
@@ -45,7 +48,9 @@ RUN pip uninstall -y tensorflow && \
     pip install /tmp/tensorflow_gpu/tensorflow*.whl && \
     rm -rf /tmp/tensorflow_gpu && \
     conda uninstall -y pytorch-cpu torchvision-cpu && \
-    conda install -y pytorch torchvision -c pytorch
+    conda install -y pytorch torchvision -c pytorch && \
+    /tmp/clean-layer.sh
 
 # Install GPU-only packages
-RUN pip install pycuda
+RUN pip install pycuda && \
+    /tmp/clean-layer.sh