Skip to content

Commit

Permalink
Use Colab as a base image. (#1444)
Browse files Browse the repository at this point in the history
This commit makes a number of major changes:
- Colab is the base image
- uv is the main package install tool
- leveraging requirements.txt instead of many separate installs
- stop building and installing tensorflow/torch/lightgbm/jax since those
are managed by the Colab base image now

In order to decide what packages to explicitly install I:
- looked at what packages are in the Colab base image
- looked at what packages were in the Kaggle image
- looked at what packages were explicitly mentioned in Kaggle Dockerfile

This may still take a few iterations to get all the right parts in the
image, but this should hopefully make the image much more manageable.

http://b/365782129
  • Loading branch information
djherbis authored Nov 27, 2024
1 parent 03c832e commit 66bac48
Show file tree
Hide file tree
Showing 9 changed files with 228 additions and 598 deletions.
594 changes: 75 additions & 519 deletions Dockerfile.tmpl

Large diffs are not rendered by default.

60 changes: 0 additions & 60 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -21,66 +21,6 @@ pipeline {
}

stages {
stage('Pre-build Packages from Source') {
parallel {
stage('torch') {
options {
timeout(time: 300, unit: 'MINUTES')
}
steps {
sh '''#!/bin/bash
set -exo pipefail
source config.txt
cd packages/
./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG \
--package torch \
--version $TORCH_VERSION \
--build-arg TORCHAUDIO_VERSION=$TORCHAUDIO_VERSION \
--build-arg TORCHVISION_VERSION=$TORCHVISION_VERSION \
--build-arg CUDA_MAJOR_VERSION=$CUDA_MAJOR_VERSION \
--build-arg CUDA_MINOR_VERSION=$CUDA_MINOR_VERSION \
--push
'''
}
}
stage('lightgbm') {
options {
timeout(time: 10, unit: 'MINUTES')
}
steps {
sh '''#!/bin/bash
set -exo pipefail
source config.txt
cd packages/
./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG \
--package lightgbm \
--version $LIGHTGBM_VERSION \
--build-arg CUDA_MAJOR_VERSION=$CUDA_MAJOR_VERSION \
--build-arg CUDA_MINOR_VERSION=$CUDA_MINOR_VERSION \
--push
'''
}
}
stage('jaxlib') {
options {
timeout(time: 300, unit: 'MINUTES')
}
steps {
sh '''#!/bin/bash
set -exo pipefail
source config.txt
cd packages/
./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG \
--package jaxlib \
--version $JAX_VERSION \
--build-arg CUDA_MAJOR_VERSION=$CUDA_MAJOR_VERSION \
--build-arg CUDA_MINOR_VERSION=$CUDA_MINOR_VERSION \
--push
'''
}
}
}
}
stage('Build/Test/Diff') {
parallel {
stage('CPU') {
Expand Down
4 changes: 1 addition & 3 deletions clean-layer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,4 @@ apt-get clean
# Ensures the current working directory won't be deleted
cd /usr/local/src/
# Delete source files used for building binaries
rm -rf /usr/local/src/*
# Delete conda downloaded tarballs
conda clean -y --tarballs
rm -rf /usr/local/src/*
11 changes: 1 addition & 10 deletions config.txt
Original file line number Diff line number Diff line change
@@ -1,11 +1,2 @@
BASE_IMAGE_REPO=gcr.io/deeplearning-platform-release
BASE_IMAGE_TAG=m122
CPU_BASE_IMAGE_NAME=tf2-cpu.2-16.py310
GPU_BASE_IMAGE_NAME=tf2-gpu.2-16.py310
LIGHTGBM_VERSION=4.2.0
TORCH_VERSION=2.4.0
TORCHAUDIO_VERSION=2.4.0
TORCHVISION_VERSION=0.19.0
JAX_VERSION=0.4.26
CUDA_MAJOR_VERSION=12
CUDA_MINOR_VERSION=3
CUDA_MINOR_VERSION=2
139 changes: 139 additions & 0 deletions kaggle_requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
altair>=5.4.0
Babel
Boruta
Cartopy
ImageHash
Janome
PyArabic
PyUpSet
Pympler
Rtree
shapely<2
SimpleITK
TPOT
Theano
Wand
annoy
arrow
bayesian-optimization
boto3
catboost
category-encoders
cesium
comm
cytoolz
dask-expr
datasets
datashader
deap
dipy
docker
easyocr
eli5
emoji
fasttext
featuretools
fiona
fury
fuzzywuzzy
geojson
# geopandas > v0.14.4 breaks learn tools
geopandas==v0.14.4
google-cloud-aiplatform
# google-cloud-automl 2.0.0 introduced incompatible API changes, need to pin to 1.0.1
google-cloud-automl==1.0.1
# b/315753846: Unpin translate package.
google-cloud-translate==3.12.1
google-cloud-videointelligence
google-cloud-vision
gpxpy
h2o
haversine
hep-ml
igraph
ipympl
ipywidgets==8.1.5
isoweek
jedi
# b/276358430: fix Jupyter lsp freezing up the jupyter server
jupyter-lsp==1.5.1
# b/333854354: pin jupyter-server to version 2.12.5; later versions break LSP (b/333854354)
jupyter_server==2.12.5
jupyterlab
jupyterlab-lsp
kaggle-environments
kagglehub>=0.3.4
# Keras 3.6 broke test_keras.py > test_train > keras.datasets.mnist.load_data():
# See https://github.com/keras-team/keras/commit/dcefb139863505d166dd1325066f329b3033d45a
keras<3.6
keras-cv
keras-nlp
keras-tuner
kornia
langid
leven
# b/328788268: libpysal 4.10 seems to fail with "module 'shapely' has no attribute 'Geometry'. Did you mean: 'geometry'"
libpysal<=4.9.2
lime
line_profiler
mamba
mlcrate
mne
mpld3
nbdev
nilearn
olefile
onnx
openslide-bin
openslide-python
optuna
pandas-profiling
pandasql
papermill
path
path.py
pdf2image
plotly-express
preprocessing
pudb
pyLDAvis
pycryptodome
pydegensac
pydicom
pydub
pyemd
pyexcel-ods
pymc3
pymongo
pypdf
pytesseract
python-lsp-server
pytorch-ignite
pytorch-lightning
qgrid
qtconsole
ray
rgf-python
s3fs
scikit-learn-intelex
scikit-multilearn
scikit-optimize
scikit-plot
scikit-surprise
git+https://github.com/facebookresearch/segment-anything.git
shap
squarify
tensorflow-cloud
tensorflow-io
tensorflow-text
tensorflow_decision_forests
timm
torchinfo
torchmetrics
tsfresh
vtk
wandb
wavio
xgboost==2.0.3
xvfbwrapper
ydata-profiling
8 changes: 5 additions & 3 deletions test
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ set -e

IMAGE_TAG='kaggle/python-build'
IMAGE_TAG_OVERRIDE=''
ADDITONAL_OPTS=''
ADDITONAL_OPTS='--runtime runc ' # Use the CPU runtime by default
PATTERN='test*.py'

usage() {
Expand Down Expand Up @@ -69,8 +69,6 @@ readonly ADDITONAL_OPTS
readonly PATTERN

set -x
docker run --rm --net=none -v /tmp/python-build:/tmp/python-build "$IMAGE_TAG" rm -rf /tmp/python-build/*
docker rm jupyter_test || true
mkdir -p /tmp/python-build/tmp
mkdir -p /tmp/python-build/devshm
mkdir -p /tmp/python-build/working
Expand All @@ -97,6 +95,9 @@ fi
# Note about `--hostname localhost` (b/158137436)
# hostname defaults to the container name which fails DNS name
# resolution with --net=none (required to keep tests hermetic). See details in bug.
#
# Note about CLOUDSDK_CONFIG=/tmp/.config/gcloud
# We use the /tmp dir since the filesystem is --read-only and we need writable space for gcloud configs.
docker run --rm -t --read-only --net=none \
-e HOME=/tmp -e KAGGLE_DATA_PROXY_TOKEN=test-key \
-e KAGGLE_USER_SECRETS_TOKEN_KEY=test-secrets-key \
Expand All @@ -105,6 +106,7 @@ docker run --rm -t --read-only --net=none \
-e KAGGLE_DATA_PROXY_PROJECT=test \
-e TF_FORCE_GPU_ALLOW_GROWTH=true \
-e XLA_PYTHON_CLIENT_PREALLOCATE=false \
-e CLOUDSDK_CONFIG=/tmp/.config/gcloud \
--hostname localhost \
--shm-size=2g \
-v $PWD:/input:ro -v /tmp/python-build/working:/working \
Expand Down
1 change: 1 addition & 0 deletions tests/test_cuml.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
class TestCuml(unittest.TestCase):
@gpu_test
@p100_exempt # b/342143152: cuML(>=24.4v) is incompatible with p100 GPUs.
@unittest.skip("b/381287748 cuML is not installed in Colab.")
def test_pca_fit_transform(self):
import unittest
import numpy as np
Expand Down
7 changes: 4 additions & 3 deletions tests/test_fastai.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,9 @@ def test_tabular(self):
"/input/tests/data/train.csv",
cont_names=["pixel"+str(i) for i in range(784)],
y_names='label',
procs=[FillMissing, Categorify, Normalize])
procs=[FillMissing, Categorify, Normalize])
learn = tabular_learner(dls, layers=[200, 100])
learn.fit_one_cycle(n_epoch=1)
with learn.no_bar():
learn.fit_one_cycle(n_epoch=1)

self.assertGreater(learn.smooth_loss, 0)
self.assertGreater(learn.smooth_loss, 0)
2 changes: 2 additions & 0 deletions tests/test_lightgbm.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,9 @@ def test_cpu(self):

self.assertEqual(1, gbm.best_iteration)

# TODO(b/381256047): Colab needs to install GPU-enabled lightgbm.
@gpu_test
@unittest.skip("Skipping this test until b/381256047 is resolved.")
def test_gpu(self):
lgb_train, lgb_eval = self.load_datasets()

Expand Down

0 comments on commit 66bac48

Please sign in to comment.