-
Notifications
You must be signed in to change notification settings - Fork 957
/
Dockerfile.tmpl
192 lines (154 loc) · 8.94 KB
/
Dockerfile.tmpl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# NOTE(review): the base is intentionally tracked at :latest so Kaggle follows
# Colab's current runtime image; pin a digest here if reproducible builds are needed.
FROM us-docker.pkg.dev/colab-images/public/runtime:latest

# COPY is preferred over ADD for plain local files (hadolint DL3020).
COPY kaggle_requirements.txt /kaggle_requirements.txt

# Freeze existing requirements from base image for critical packages:
RUN pip freeze | grep -E 'tensorflow|keras|torch|jax|lightgbm' > /colab_requirements.txt

# Merge requirements files:
RUN cat /colab_requirements.txt >> /requirements.txt
RUN cat /kaggle_requirements.txt >> /requirements.txt

# TODO: GPU requirements.txt
# TODO: merge them better (override matching ones).

# Install uv & Kaggle packages
# NOTE(review): piping an unpinned installer script into sh is not reproducible;
# consider pinning a uv release and verifying its checksum.
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
RUN export PATH="${HOME}/.local/bin:${PATH}" && uv pip install --system -r /requirements.txt
# Fix: "~" is NOT expanded inside ENV values, so the previous
# ENV PATH="~/.local/bin:..." added a literal "~" entry that never resolved.
# The build runs as root, so spell out /root/.local/bin explicitly (this matches
# ${HOME}/.local/bin used in the RUN above).
ENV PATH="/root/.local/bin:${PATH}"
# Install manual packages:
# b/183041606#comment5: the Kaggle data proxy doesn't support these APIs. If the library is missing, it falls back to using a regular BigQuery query to fetch data.
RUN uv pip uninstall --system google-cloud-bigquery-storage

# uv cannot install this in requirements.txt without --no-build-isolation
# to avoid affecting the larger build, we'll post-install it.
RUN uv pip install --no-build-isolation --system "git+https://github.com/Kaggle/learntools"

# b/328788268 We install an incompatible pair of libs (shapely<2, libpysal==4.9.2) so we can't put this one in the requirements.txt
RUN uv pip install --system "libpysal==4.9.2"

# Adding non-package dependencies:
# Use COPY, not ADD, for plain local files (hadolint DL3020).
COPY clean-layer.sh /tmp/clean-layer.sh
COPY patches/nbconvert-extensions.tpl /opt/kaggle/nbconvert-extensions.tpl
COPY patches/template_conf.json /opt/kaggle/conf.json

# Site-packages directory of the base image's Python 3.10; patch files below are
# dropped directly into installed packages under this path.
# /opt/conda/lib/python3.10/site-packages
ARG PACKAGE_PATH=/usr/local/lib/python3.10/dist-packages
# Install GPU-specific non-pip packages.
# Go-template conditional: this Dockerfile.tmpl is rendered per accelerator type.
{{ if eq .Accelerator "gpu" }}
# CUDA version components are supplied by the build pipeline via --build-arg.
ARG CUDA_MAJOR_VERSION \
CUDA_MINOR_VERSION
# Re-export as ENV so the selected CUDA version is also visible at runtime.
ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION} \
CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION}
# Make sure we are on the right version of CUDA
RUN update-alternatives --set cuda /usr/local/cuda-$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION
RUN uv pip install --system "pycuda"
# Remove CUDA_VERSION from non-GPU image.
{{ else }}
ENV CUDA_VERSION=""
{{ end }}
# Update GPG key per documentation at https://cloud.google.com/compute/docs/troubleshooting/known-issues
# Fix: the docker build already runs as root, so `sudo` is unnecessary (hadolint
# DL3004) and may not even be installed in the base image. `-fsS` makes a failed
# download fail the build instead of piping an HTML error page into apt-key.
RUN curl -fsS https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
RUN curl -fsS https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add -

# Use a fixed apt-get repo to stop intermittent failures due to flaky httpredir connections,
# as described by Lionel Chan at http://stackoverflow.com/a/37426929/5881346
RUN sed -i "s/httpredir.debian.org/debian.uchicago.edu/" /etc/apt/sources.list && \
    apt-get update --allow-releaseinfo-change && \
    # Needed by lightGBM (GPU build)
    # https://lightgbm.readthedocs.io/en/latest/GPU-Tutorial.html#build-lightgbm
    apt-get install -y build-essential unzip cmake libboost-dev libboost-system-dev libboost-filesystem-dev p7zip-full && \
    # b/182601974: ssh client was removed from the base image but is required for packages such as stable-baselines.
    apt-get install -y openssh-client && \
    apt-get install -y graphviz && pip install graphviz && \
    /tmp/clean-layer.sh
# Patch files dropped into tensorflow_decision_forests' keras package
# (COPY, not ADD, for plain local files — hadolint DL3020).
COPY patches/keras_internal.py \
    patches/keras_internal_test.py \
    $PACKAGE_PATH/tensorflow_decision_forests/keras/

# Font and X11/render shared libraries.
# Consistency fix: run clean-layer.sh in the same layer, as every other apt-get
# layer in this file does, so the apt cache doesn't persist in this layer.
RUN apt-get install -y libfreetype6-dev && \
    apt-get install -y libglib2.0-0 libxext6 libsm6 libxrender1 libfontconfig1 --fix-missing && \
    /tmp/clean-layer.sh
# NLTK Project datasets
# Downloads the corpora into the system-wide NLTK search path so notebooks can
# use them without a per-user download step.
RUN mkdir -p /usr/share/nltk_data && \
# NLTK Downloader no longer continues smoothly after an error, so we explicitly list
# the corpuses that work
python -m nltk.downloader -d /usr/share/nltk_data abc alpino averaged_perceptron_tagger \
basque_grammars biocreative_ppi bllip_wsj_no_aux \
book_grammars brown brown_tei cess_cat cess_esp chat80 city_database cmudict \
comtrans conll2000 conll2002 conll2007 crubadan dependency_treebank \
europarl_raw floresta gazetteers genesis gutenberg \
ieer inaugural indian jeita kimmo knbc large_grammars lin_thesaurus mac_morpho machado \
masc_tagged maxent_ne_chunker maxent_treebank_pos_tagger moses_sample movie_reviews \
mte_teip5 names nps_chat omw opinion_lexicon paradigms \
pil pl196x porter_test ppattach problem_reports product_reviews_1 product_reviews_2 propbank \
pros_cons ptb punkt qc reuters rslp rte sample_grammars semcor senseval sentence_polarity \
sentiwordnet shakespeare sinica_treebank smultron snowball_data spanish_grammars \
state_union stopwords subjectivity swadesh switchboard tagsets timit toolbox treebank \
twitter_samples udhr2 udhr unicode_samples universal_tagset universal_treebanks_v20 \
vader_lexicon verbnet webtext word2vec_sample wordnet wordnet_ic words ycoe
# Miscellaneous apt dependencies, installed in a single layer:
RUN apt-get install -y \
    # large-file support for git checkouts
    git-lfs \
    # vtk dependencies
    libgl1-mesa-glx \
    # xvfbwrapper dependencies
    xvfb \
    && /tmp/clean-layer.sh
# Download base easyocr models.
# https://github.com/JaidedAI/EasyOCR#usage
# Each release archive is fetched, extracted into the EasyOCR model directory,
# and the zip is deleted in the same layer so it never persists in the image.
RUN mkdir -p /root/.EasyOCR/model && \
    for model_url in \
        "https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/latin_g2.zip" \
        "https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/english_g2.zip" \
        "https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip"; \
    do \
        wget --no-verbose "$model_url" -O /tmp/easyocr_model.zip && \
        unzip /tmp/easyocr_model.zip -d /root/.EasyOCR/model/ && \
        rm /tmp/easyocr_model.zip || exit 1; \
    done && \
    /tmp/clean-layer.sh
# Tesseract and some associated utility packages
RUN apt-get install -y tesseract-ocr

# Runtime environment knobs, grouped into one ENV instruction:
ENV TESSERACT_PATH=/usr/bin/tesseract \
    # For Facets, we also include an empty path to include $PWD.
    PYTHONPATH=:$PYTHONPATH:/opt/facets/facets_overview/python/ \
    # For Theano with MKL
    MKL_THREADING_LAYER=GNU
# Temporary fixes and patches
# Stop jupyter nbconvert trying to rewrite its folder hierarchy
# (pre-creating the config/migration marker files for both /root and / homes).
RUN mkdir -p /root/.jupyter && touch /root/.jupyter/jupyter_nbconvert_config.py && touch /root/.jupyter/migrated && \
mkdir -p /.jupyter && touch /.jupyter/jupyter_nbconvert_config.py && touch /.jupyter/migrated && \
# Make matplotlib output in Jupyter notebooks display correctly
mkdir -p /etc/ipython/ && echo "c = get_config(); c.IPKernelApp.matplotlib = 'inline'" > /etc/ipython/ipython_config.py && \
/tmp/clean-layer.sh
# Fix to import bq_helper library without downgrading setuptools
# The upstream repo ships bq_helper.py as a flat module; restructure it into a
# proper package (bq_helper/__init__.py), patch setup.py to declare the package,
# then install it editable so the source tree stays importable.
RUN mkdir -p ~/src && git clone https://github.com/SohierDane/BigQuery_Helper ~/src/BigQuery_Helper && \
mkdir -p ~/src/BigQuery_Helper/bq_helper && \
mv ~/src/BigQuery_Helper/bq_helper.py ~/src/BigQuery_Helper/bq_helper/__init__.py && \
mv ~/src/BigQuery_Helper/test_helper.py ~/src/BigQuery_Helper/bq_helper/ && \
# Rewrites the setup() call's closing paren to inject the packages= argument.
sed -i 's/)/packages=["bq_helper"])/g' ~/src/BigQuery_Helper/setup.py && \
uv pip install --system -e ~/src/BigQuery_Helper && \
/tmp/clean-layer.sh
# install imagemagick for wand
# https://docs.wand-py.org/en/latest/guide/install.html#install-imagemagick-on-debian-ubuntu
# Fix: the install was missing -y; without it apt-get prompts for confirmation
# and aborts when stdin is not a TTY (the non-interactive docker build case).
RUN apt-get install -y libmagickwand-dev

# Override default imagemagick policies
# (COPY, not ADD, for a plain local file — hadolint DL3020.)
COPY patches/imagemagick-policy.xml /etc/ImageMagick-6/policy.xml
# Add Kaggle module resolver
ADD patches/kaggle_module_resolver.py $PACKAGE_PATH/tensorflow_hub/kaggle_module_resolver.py
# Register the resolver inside tensorflow_hub's config.py:
# 1) append an import of kaggle_module_resolver after the existing
#    uncompressed_module_resolver import line;
# 2) append the add_implementation(...) call right after the line containing
#    _install_default_resolvers() — the "\ \ " escapes produce the two-space
#    indent the surrounding Python function body expects.
RUN sed -i '/from tensorflow_hub import uncompressed_module_resolver/a from tensorflow_hub import kaggle_module_resolver' $PACKAGE_PATH/tensorflow_hub/config.py && \
sed -i '/_install_default_resolvers()/a \ \ registry.resolver.add_implementation(kaggle_module_resolver.KaggleFileResolver())' $PACKAGE_PATH/tensorflow_hub/config.py
# Add BigQuery client proxy settings
ENV PYTHONUSERBASE="/root/.local"
# Kaggle platform integration shims, dropped directly into site-packages
# (COPY, not ADD, for plain local files — hadolint DL3020).
COPY patches/kaggle_gcp.py \
    patches/kaggle_secrets.py \
    patches/kaggle_session.py \
    patches/kaggle_web_client.py \
    patches/kaggle_datasets.py \
    patches/log.py \
    $PACKAGE_PATH/

# Figure out why this is in a different place?
# Found by doing a export PYTHONVERBOSE=1 and then running python and checking for where it looked for it.
COPY patches/sitecustomize.py /usr/lib/python3.10/sitecustomize.py
# Build provenance, supplied by the build pipeline via --build-arg.
ARG GIT_COMMIT=unknown \
BUILD_DATE=unknown
# Expose provenance as image labels for registry tooling...
LABEL git-commit=$GIT_COMMIT \
build-date=$BUILD_DATE
# ...and as environment variables inside running containers.
ENV GIT_COMMIT=${GIT_COMMIT} \
BUILD_DATE=${BUILD_DATE}
# Correlate current release with the git hash inside the kernel editor by running `!cat /etc/git_commit`.
RUN echo "$GIT_COMMIT" > /etc/git_commit && echo "$BUILD_DATE" > /etc/build_date
# GPU-only runtime environment tweaks (template-rendered).
{{ if eq .Accelerator "gpu" }}
# Remove the CUDA stubs.
# NOTE(review): LD_LIBRARY_PATH_NO_STUBS is presumably set by the base image —
# confirm it exists in the Colab runtime; if unset this empties LD_LIBRARY_PATH.
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH_NO_STUBS" \
# Add the CUDA home.
CUDA_HOME=/usr/local/cuda
{{ end }}
# env as entrypoint lets the runtime supply the actual command (exec form, so
# signals reach the child process directly).
ENTRYPOINT ["/usr/bin/env"]