refactor: Move RAG Service under Presets (kaito-project#715)
**Reason for Change**:
Move the RAG Service alongside the other services, under the `presets` folder.
ishaansehgal99 authored Nov 21, 2024
1 parent 9d72066 commit 2d57916
Showing 130 changed files with 119 additions and 122 deletions.
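Taken together, the diff is a mechanical path relocation. A minimal sketch of the moves, assuming plain `git mv` from the repository root (the commands themselves are a reconstruction; only the resulting paths are confirmed by the diff below):

```sh
# Hypothetical reconstruction of the relocation; only the destination
# paths are confirmed by this commit's diff.
mkdir -p presets/workspace
git mv presets/inference      presets/workspace/inference
git mv presets/tuning         presets/workspace/tuning
git mv presets/models         presets/workspace/models
git mv presets/test           presets/workspace/test
git mv presets/dependencies   presets/workspace/dependencies
git mv pkg/ragengine/services presets/ragengine
```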
1 change: 0 additions & 1 deletion .gitattributes
@@ -1 +0,0 @@
presets/test/** linguist-vendored
4 changes: 2 additions & 2 deletions .github/dependabot.yml
@@ -42,13 +42,13 @@ updates:
interval: daily

- package-ecosystem: pip
directory: /presets/inference/text-generation
directory: /presets/workspace/inference/text-generation
schedule:
interval: daily
open-pull-requests-limit: 0

- package-ecosystem: pip
directory: /presets/tuning/tfs
directory: /presets/workspace/tuning/tfs
schedule:
interval: daily
open-pull-requests-limit: 0
10 changes: 5 additions & 5 deletions .github/workflows/e2e-preset-test.yml
@@ -209,7 +209,7 @@ jobs:
fi
- name: Create Service
run: kubectl apply -f presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}-service.yaml
run: kubectl apply -f presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}-service.yaml

- name: Retrieve External Service IP
id: get_ip
@@ -229,10 +229,10 @@ jobs:
- name: Replace IP and Deploy Resource to K8s
run: |
sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
kubectl apply -f presets/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
kubectl apply -f presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
- name: Wait for Resource to be ready
run: |
4 changes: 2 additions & 2 deletions .github/workflows/kind-cluster/determine_models.py
@@ -17,7 +17,7 @@ def read_yaml(file_path):
print(f"Error reading {file_path}: {e}")
return None

supp_models_yaml = 'presets/models/supported_models.yaml'
supp_models_yaml = 'presets/workspace/models/supported_models.yaml'
YAML_PR = read_yaml(supp_models_yaml)
# Format: {falcon-7b : {model_name:falcon-7b, type:text-generation, version: #, tag: #}}
MODELS = {model['name']: model for model in YAML_PR['models']}
@@ -85,7 +85,7 @@ def models_to_build(files_changed):
models.update(detect_changes_in_yaml(yaml_main, YAML_PR))
for model, model_info in MODELS.items():
if model_info["type"] not in seen_model_types:
if any(file.startswith(f'presets/inference/{model_info["type"]}') for file in files_changed):
if any(file.startswith(f'presets/workspace/inference/{model_info["type"]}') for file in files_changed):
models.add(model)
seen_model_types.add(model_info["type"])
return list(models)
2 changes: 1 addition & 1 deletion .github/workflows/kind-cluster/main.py
@@ -13,7 +13,7 @@ def get_weights_path(model_name):
return f"{WEIGHTS_FOLDER}/{model_name}/weights"

def get_dockerfile_path(model_runtime):
return f"/kaito/docker/presets/models/{model_runtime}/Dockerfile"
return f"/kaito/docker/presets/workspace/models/{model_runtime}/Dockerfile"

def generate_unique_id():
"""Generate a unique identifier for a job."""
8 changes: 4 additions & 4 deletions .github/workflows/preset-image-build-1ES.yml
@@ -9,14 +9,14 @@ on:
branches:
- main
paths:
- 'presets/inference/**'
- 'presets/models/supported_models.yaml'
- 'presets/workspace/inference/**'
- 'presets/workspace/models/supported_models.yaml'
push:
branches:
- main
paths:
- 'presets/inference/**'
- 'presets/models/supported_models.yaml'
- 'presets/workspace/inference/**'
- 'presets/workspace/models/supported_models.yaml'
workflow_dispatch:
inputs:
force-run-all:
8 changes: 4 additions & 4 deletions .github/workflows/preset-image-build.yml
@@ -9,14 +9,14 @@ on:
branches:
- main
paths:
- 'presets/inference/**'
- 'presets/models/supported_models.yaml'
- 'presets/workspace/inference/**'
- 'presets/workspace/models/supported_models.yaml'
push:
branches:
- main
paths:
- 'presets/inference/**'
- 'presets/models/supported_models.yaml'
- 'presets/workspace/inference/**'
- 'presets/workspace/models/supported_models.yaml'
workflow_dispatch:
inputs:
force-run-all:
16 changes: 8 additions & 8 deletions Makefile
@@ -43,7 +43,7 @@ AZURE_KARPENTER_MSI_NAME ?= azkarpenterIdentity
RUN_LLAMA_13B ?= false
AI_MODELS_REGISTRY ?= modelregistry.azurecr.io
AI_MODELS_REGISTRY_SECRET ?= modelregistry
SUPPORTED_MODELS_YAML_PATH ?= ~/runner/_work/kaito/kaito/presets/models/supported_models.yaml
SUPPORTED_MODELS_YAML_PATH ?= ~/runner/_work/kaito/kaito/presets/workspace/models/supported_models.yaml

# Scripts
GO_INSTALL := ./hack/go-install.sh
@@ -99,22 +99,22 @@ unit-test: ## Run unit tests.

.PHONY: rag-service-test
rag-service-test:
pip install -r pkg/ragengine/services/requirements.txt
pytest -o log_cli=true -o log_cli_level=INFO pkg/ragengine/services/tests
pip install -r presets/ragengine/requirements.txt
pytest -o log_cli=true -o log_cli_level=INFO presets/ragengine/tests

.PHONY: tuning-metrics-server-test
tuning-metrics-server-test:
pip install -r ./presets/dependencies/requirements-test.txt
pytest -o log_cli=true -o log_cli_level=INFO presets/tuning/text-generation/metrics
pip install -r ./presets/workspace/dependencies/requirements-test.txt
pytest -o log_cli=true -o log_cli_level=INFO presets/workspace/tuning/text-generation/metrics

## --------------------------------------
## E2E tests
## --------------------------------------

inference-api-e2e:
pip install -r ./presets/dependencies/requirements-test.txt
pytest -o log_cli=true -o log_cli_level=INFO presets/inference/vllm
pytest -o log_cli=true -o log_cli_level=INFO presets/inference/text-generation
pip install -r ./presets/workspace/dependencies/requirements-test.txt
pytest -o log_cli=true -o log_cli_level=INFO presets/workspace/inference/vllm
pytest -o log_cli=true -o log_cli_level=INFO presets/workspace/inference/text-generation

# Ginkgo configurations
GINKGO_FOCUS ?=
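With the Makefile changes above, the relocated test suites run from the repository root against the new paths. A hedged example of invoking the RAG service tests directly, mirroring the updated target (the pytest options are copied from the Makefile; a working Python environment is assumed):

```sh
# RAG service tests, now located under presets/ragengine
pip install -r presets/ragengine/requirements.txt
pytest -o log_cli=true -o log_cli_level=INFO presets/ragengine/tests

# Or via the Make target defined above
make rag-service-test
```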
2 changes: 1 addition & 1 deletion charts/kaito/workspace/templates/lora-params.yaml
@@ -29,7 +29,7 @@ data:
DataCollator: # Configurable Parameters: https://huggingface.co/docs/transformers/v4.40.2/en/main_classes/data_collator#transformers.DataCollatorForLanguageModeling
mlm: true # Default setting; included to show DataCollator can be updated.
DatasetConfig: # Configurable Parameters: https://github.com/kaito-project/kaito/blob/main/presets/tuning/text-generation/cli.py#L44
DatasetConfig: # Configurable Parameters: https://github.com/kaito-project/kaito/blob/main/presets/workspace/tuning/text-generation/cli.py#L44
shuffle_dataset: true
train_test_split: 1 # Default to using all data for fine-tuning due to strong pre-trained baseline and typically limited fine-tuning data
# Expected Dataset format:
2 changes: 1 addition & 1 deletion charts/kaito/workspace/templates/qlora-params.yaml
@@ -32,7 +32,7 @@ data:
DataCollator: # Configurable Parameters: https://huggingface.co/docs/transformers/v4.40.2/en/main_classes/data_collator#transformers.DataCollatorForLanguageModeling
mlm: true # Default setting; included to show DataCollator can be updated.
DatasetConfig: # Configurable Parameters: https://github.com/kaito-project/kaito/blob/main/presets/tuning/text-generation/cli.py#L44
DatasetConfig: # Configurable Parameters: https://github.com/kaito-project/kaito/blob/main/presets/workspace/tuning/text-generation/cli.py#L44
shuffle_dataset: true
train_test_split: 1 # Default to using all data for fine-tuning due to strong pre-trained baseline and typically limited fine-tuning data
# Expected Dataset format:
12 changes: 6 additions & 6 deletions cmd/workspace/models.go
@@ -3,10 +3,10 @@
package main

import (
_ "github.com/kaito-project/kaito/presets/models/falcon"
_ "github.com/kaito-project/kaito/presets/models/llama2"
_ "github.com/kaito-project/kaito/presets/models/llama2chat"
_ "github.com/kaito-project/kaito/presets/models/mistral"
_ "github.com/kaito-project/kaito/presets/models/phi2"
_ "github.com/kaito-project/kaito/presets/models/phi3"
_ "github.com/kaito-project/kaito/presets/workspace/models/falcon"
_ "github.com/kaito-project/kaito/presets/workspace/models/llama2"
_ "github.com/kaito-project/kaito/presets/workspace/models/llama2chat"
_ "github.com/kaito-project/kaito/presets/workspace/models/mistral"
_ "github.com/kaito-project/kaito/presets/workspace/models/phi2"
_ "github.com/kaito-project/kaito/presets/workspace/models/phi3"
)
2 changes: 1 addition & 1 deletion docker/presets/models/llama-2/Dockerfile
@@ -30,4 +30,4 @@ ARG VERSION
RUN echo $VERSION > /workspace/llama/version.txt

ADD ${WEIGHTS_PATH} /workspace/llama/llama-2/weights
ADD presets/inference/${MODEL_TYPE} /workspace/llama/llama-2
ADD presets/workspace/inference/${MODEL_TYPE} /workspace/llama/llama-2
4 changes: 2 additions & 2 deletions docker/presets/models/tfs-onnx/Dockerfile
@@ -14,10 +14,10 @@ RUN echo $VERSION > /workspace/tfs/version.txt
# First, copy just the requirements.txt file and install dependencies
# This is done before copying the code to utilize Docker's layer caching and
# avoid reinstalling dependencies unless the requirements file changes.
COPY kaito/presets/inference/${MODEL_TYPE}/requirements.txt /workspace/tfs/requirements.txt
COPY kaito/presets/workspace/inference/${MODEL_TYPE}/requirements.txt /workspace/tfs/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

COPY kaito/presets/inference/${MODEL_TYPE}/inference_api.py /workspace/tfs/inference_api.py
COPY kaito/presets/workspace/inference/${MODEL_TYPE}/inference_api.py /workspace/tfs/inference_api.py

# Convert to ONNX Runtime
# RUN python convert_to_onnx.py ${MODEL_NAME}
18 changes: 9 additions & 9 deletions docker/presets/models/tfs/Dockerfile
@@ -7,24 +7,24 @@ ARG VERSION
# Set the working directory
WORKDIR /workspace

COPY kaito/presets/dependencies/requirements.txt /workspace/requirements.txt
COPY kaito/presets/workspace/dependencies/requirements.txt /workspace/requirements.txt

RUN pip install --no-cache-dir -r /workspace/requirements.txt

# 1. Huggingface transformers
COPY kaito/presets/inference/${MODEL_TYPE}/inference_api.py \
kaito/presets/tuning/${MODEL_TYPE}/cli.py \
kaito/presets/tuning/${MODEL_TYPE}/fine_tuning.py \
kaito/presets/tuning/${MODEL_TYPE}/parser.py \
kaito/presets/tuning/${MODEL_TYPE}/dataset.py \
kaito/presets/tuning/${MODEL_TYPE}/metrics/metrics_server.py \
COPY kaito/presets/workspace/inference/${MODEL_TYPE}/inference_api.py \
kaito/presets/workspace/tuning/${MODEL_TYPE}/cli.py \
kaito/presets/workspace/tuning/${MODEL_TYPE}/fine_tuning.py \
kaito/presets/workspace/tuning/${MODEL_TYPE}/parser.py \
kaito/presets/workspace/tuning/${MODEL_TYPE}/dataset.py \
kaito/presets/workspace/tuning/${MODEL_TYPE}/metrics/metrics_server.py \
/workspace/tfs/

# 2. vLLM
COPY kaito/presets/inference/vllm/inference_api.py /workspace/vllm/inference_api.py
COPY kaito/presets/workspace/inference/vllm/inference_api.py /workspace/vllm/inference_api.py

# Chat template
ADD kaito/presets/inference/chat_templates /workspace/chat_templates
ADD kaito/presets/workspace/inference/chat_templates /workspace/chat_templates

# Model weights
COPY ${WEIGHTS_PATH} /workspace/weights
2 changes: 1 addition & 1 deletion docker/ragengine/service/Dockerfile
@@ -3,7 +3,7 @@ FROM python:3.12-slim
WORKDIR /app

# Copy all files from ragengine/services into the app/services folder
COPY pkg/ragengine/services/ services/
COPY presets/ragengine/ services/

# Set the PYTHONPATH environment variable
ENV PYTHONPATH=/app
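Because the Dockerfile now copies `presets/ragengine/` into the image, the build context has to be the repository root. A sketch of the build command under that assumption (the image name is a placeholder, not something defined by this commit):

```sh
# Assumes the repository root as the build context so that
# presets/ragengine/ resolves; <IMAGE_NAME> is a placeholder.
docker build -t <IMAGE_NAME> -f docker/ragengine/service/Dockerfile .
```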
4 changes: 2 additions & 2 deletions docs/How-to-add-new-models.md
@@ -9,7 +9,7 @@ This step is done by the requestor. The requestor should make a PR to describe t

## Step 2: Validate and test the model

This step is done by Kaito maintainers. Based on the information provided in the proposal, Kaito maintainers will download the model and test it using the specified runtime. The entire process is automated via GitHub actions when Kaito maintainers file a PR to add the model to the [supported\_models.yaml](../presets/models/supported_models.yaml).
This step is done by Kaito maintainers. Based on the information provided in the proposal, Kaito maintainers will download the model and test it using the specified runtime. The entire process is automated via GitHub actions when Kaito maintainers file a PR to add the model to the [supported\_models.yaml](../presets/workspace/models/supported_models.yaml).


## Step 3: Push model image to MCR
@@ -18,7 +18,7 @@ This step is done by Kaito maintainers. If the model license allows, Kaito maint

## Step 4: Add preset configurations

This step is done by the requestor. The requestor will work on a PR to register the model with preset configurations. The PR will contain code changes to implement a simple inference interface. [Here](../presets/models/falcon/model.go) is an existing example. In the same PR, or a separate PR, the status of the proposal should be updated to `integrated`.
This step is done by the requestor. The requestor will work on a PR to register the model with preset configurations. The PR will contain code changes to implement a simple inference interface. [Here](../presets/workspace/models/falcon/model.go) is an existing example. In the same PR, or a separate PR, the status of the proposal should be updated to `integrated`.

## Step 5: Add an E2E test

4 changes: 2 additions & 2 deletions docs/custom-model-integration/Dockerfile.reference
@@ -16,7 +16,7 @@ RUN echo $VERSION > /workspace/tfs/version.txt
# This is done before copying the code to utilize Docker's layer caching and
# avoid reinstalling dependencies unless the requirements file changes.
# Inference
COPY presets/dependencies/requirements.txt /workspace/tfs/inference-requirements.txt
COPY presets/workspace/dependencies/requirements.txt /workspace/tfs/inference-requirements.txt
RUN pip install --no-cache-dir -r inference-requirements.txt

COPY presets/inference/${MODEL_TYPE}/inference_api.py /workspace/tfs/inference_api.py
COPY presets/workspace/inference/${MODEL_TYPE}/inference_api.py /workspace/tfs/inference_api.py
@@ -72,7 +72,7 @@ export WEIGHTS_PATH="kaito/phi-3-mini-4k-instruct/weights"

Navigate to the Kaito base directory and build the Docker image, ensuring the weights directory is included in the build context:
```sh
docker build -t <IMAGE_NAME> --file docker/presets/models/tfs/Dockerfile --build-arg WEIGHTS_PATH=<WEIGHTS_PATH> --build-arg MODEL_TYPE=text-generation --build-arg VERSION=<VERSION> .
docker build -t <IMAGE_NAME> --file docker/presets/workspace/models/tfs/Dockerfile --build-arg WEIGHTS_PATH=<WEIGHTS_PATH> --build-arg MODEL_TYPE=text-generation --build-arg VERSION=<VERSION> .

docker push <IMAGE_NAME>
```
2 changes: 1 addition & 1 deletion docs/inference/README.md
@@ -80,7 +80,7 @@ When adapters are specified in the `inference` spec, the Kaito controller adds a
<img src="../img/kaito-inference-adapter.png" width=40% title="Kaito inference adapter" alt="Kaito inference adapter">
</div>

If an image is specified as the adapter source, the corresponding initcontainer uses that image as its container image. These initcontainers ensure all adapter data is available locally before the inference service starts. The main container uses a supported model image, launching the [inference_api.py](../../presets/inference/text-generation/inference_api.py) script.
If an image is specified as the adapter source, the corresponding initcontainer uses that image as its container image. These initcontainers ensure all adapter data is available locally before the inference service starts. The main container uses a supported model image, launching the [inference_api.py](../../presets/workspace/inference/text-generation/inference_api.py) script.

All containers share local volumes by mounting the same `EmptyDir` volumes, avoiding file copies between containers.

16 changes: 7 additions & 9 deletions docs/tuning/README.md
@@ -73,12 +73,14 @@ TrainingArguments([full list](https://huggingface.co/docs/transformers/v4.40.2/e
DataCollator([full list](https://huggingface.co/docs/transformers/v4.40.2/en/main_classes/data_collator#transformers.DataCollatorForLanguageModeling))
- mlm: Masked language modeling flag.

DatasetConfig([full list](https://github.com/kaito-project/kaito/blob/main/presets/tuning/text-generation/cli.py#L44))
DatasetConfig([full list](https://github.com/kaito-project/kaito/blob/main/presets/workspace/tuning/text-generation/cli.py#L44))
- shuffle_dataset: Whether to shuffle the dataset.
- train_test_split: Proportion of data used for training, typically set to 1 for using all data.

## Input dataset format
The input dataset for fine tuning should follow specific formats defined in the huggingface trainer library. Supported formats include conversational and instruction formats.
The input dataset for fine-tuning should follow specific formats defined in the HuggingFace trainer library. Supported formats include conversational and instruction formats.

For example, [HuggingFace Dolly 15k OAI-style dataset](https://huggingface.co/datasets/philschmid/dolly-15k-oai-style/tree/main)

- Conversational format
```json
@@ -90,18 +92,14 @@ The input dataset for fine tuning should follow specific formats defined in the
]
}
```
For example, [HuggingFace Dolly 15k OAI-style dataset](https://huggingface.co/datasets/philschmid/dolly-15k-oai-style/tree/main).

For example, [HuggingFace Instruction Dataset](https://huggingface.co/datasets/HuggingFaceH4/instruction-dataset/tree/main)

- Instruction format
```json
{"prompt": "<prompt text>", "completion": "<ideal generated text>"}
{"prompt": "<prompt text>", "completion": "<ideal generated text>"}
{"prompt": "<prompt text>", "completion": "<ideal generated text>"}
```

For example, [HuggingFace Instruction Dataset](https://huggingface.co/datasets/HuggingFaceH4/instruction-dataset/tree/main)


If your dataset is not in one of these formats, it will be passed directly to the training library ([SFTTrainer](https://huggingface.co/docs/trl/en/sft_trainer)) without any preprocessing. This may result in undefined behavior if the dataset does not align with the trainer's expected input structure. To ensure proper functionality, you may need to preprocess the dataset to match one of the supported formats. For more details, please refer to this [documentation](https://huggingface.co/docs/trl/v0.9.4/sft_trainer#dataset-format-support).


Expand All @@ -118,7 +116,7 @@ Figure 1. Kaito tuning pod structure.

- Sidecar container: It is introduced to support automatically pushing the tuning results to a container registry. This container, with `docker` installed, runs a script to periodically check the training progress. Once the training is done, indicated by a sentinel file created by the training process, the script builds a container image containing the training results and pushes the image to the specified container registry.

- Main container: It uses one of the supported model images. The image entry launches the [fine\_tuning.py](https://github.com/kaito-project/kaito/blob/main/presets/tuning/text-generation/fine_tuning.py) script.
- Main container: It uses one of the supported model images. The image entry launches the [fine\_tuning.py](https://github.com/kaito-project/kaito/blob/main/presets/workspace/tuning/text-generation/fine_tuning.py) script.

All three containers use shared local volumes (by mounting the same `EmptyDir` volumes), hence file copies between containers are avoided.

Empty file removed pkg/ragengine/services/README.md
Empty file.