Misc

hupe1980 · Apr 30, 2024 · 9a59a6a · 9a59a6a
1 parent 2a93b7e
commit 9a59a6a
Show file tree

Hide file tree

Showing 19 changed files with 114 additions and 42 deletions.
diff --git a/aisploit/classifiers/amazon/comprehend.py b/aisploit/classifiers/amazon/comprehend.py
@@ -30,7 +30,7 @@ class ComprehendPIIClassifier(BaseComprehendClassifier[List[Any]]):
     tags: List[str] = field(default_factory=lambda: ["leakage"], init=False)
 
     def score(
-        self, input: str, _references: List[str] | None = None, _metadata: Dict[str, Any] | None = None
+        self, input: str, references: List[str] | None = None, metadata: Dict[str, Any] | None = None
     ) -> Score[List[Any]]:
         """Score the input for PII using Amazon Comprehend.
 
@@ -67,7 +67,7 @@ class ComprehendToxicityClassifier(BaseComprehendClassifier[Dict[str, Any]]):
     tags: List[str] = field(default_factory=lambda: ["toxicity"], init=False)
 
     def score(
-        self, input: str, _references: List[str] | None = None, _metadata: Dict[str, Any] | None = None
+        self, input: str, references: List[str] | None = None, metadata: Dict[str, Any] | None = None
     ) -> Score[Dict[str, Any]]:
         """Score the input for toxicity using Amazon Comprehend.
 

diff --git a/aisploit/classifiers/huggingface/bert_score.py b/aisploit/classifiers/huggingface/bert_score.py
@@ -15,7 +15,7 @@ class BertScoreClassifier(BaseTextClassifier[Dict[str, Any]]):
     bertscore: evaluate.EvaluationModule = field(default_factory=lambda: evaluate.load("bertscore"), init=False)
 
     def score(
-        self, input: str, references: List[str] | None = None, _metadata: Dict[str, Any] | None = None
+        self, input: str, references: List[str] | None = None, metadata: Dict[str, Any] | None = None
     ) -> Score[Dict[str, Any]]:
         """Score the input using BERTScore computed by the evaluate module.
 

diff --git a/aisploit/classifiers/huggingface/bleu.py b/aisploit/classifiers/huggingface/bleu.py
@@ -14,7 +14,7 @@ class BleuClassifier(BaseTextClassifier[Dict[str, Any]]):
     bleu: evaluate.EvaluationModule = field(default_factory=lambda: evaluate.load("bleu"), init=False)
 
     def score(
-        self, input: str, references: List[str] | None = None, _metadata: Dict[str, Any] | None = None
+        self, input: str, references: List[str] | None = None, metadata: Dict[str, Any] | None = None
     ) -> Score[Dict[str, Any]]:
         """Score the input using BLEU score computed by the evaluate module.
 

diff --git a/aisploit/classifiers/huggingface/pipeline_prompt_injection.py b/aisploit/classifiers/huggingface/pipeline_prompt_injection.py
@@ -32,7 +32,7 @@ def __init__(
         self._threshold = threshold
 
     def score(
-        self, input: str, _references: List[str] | None = None, _metadata: Dict[str, Any] | None = None
+        self, input: str, references: List[str] | None = None, metadata: Dict[str, Any] | None = None
     ) -> Score[float]:
         result = self._model(input)
 

diff --git a/aisploit/classifiers/markdown.py b/aisploit/classifiers/markdown.py
@@ -8,7 +8,7 @@ class MarkdownInjectionClassifier(BaseTextClassifier[List[Any]]):
     """A text classifier to detect Markdown injection in input text."""
 
     def score(
-        self, input: str, _references: List[str] | None = None, _metadata: Dict[str, Any] | None = None
+        self, input: str, references: List[str] | None = None, metadata: Dict[str, Any] | None = None
     ) -> Score[List[Any]]:
         # !\[.*?\]\((.*?)\) - This is for the inline image format in Markdown, which is ![alt_text](url).
         # !\[.*?\]\[(.*?)\] - This is for the reference-style image format in Markdown, which is ![alt_text][image_reference].

diff --git a/aisploit/classifiers/openai/moderation.py b/aisploit/classifiers/openai/moderation.py
@@ -21,7 +21,7 @@ def __init__(
         self._client = OpenAI(api_key=api_key)
 
     def score(
-        self, input: str, _references: List[str] | None = None, _metadata: Dict[str, Any] | None = None
+        self, input: str, references: List[str] | None = None, metadata: Dict[str, Any] | None = None
     ) -> Score[Moderation]:
         """Score the input using the OpenAI Moderations API.
 

diff --git a/aisploit/classifiers/package_hallucination.py b/aisploit/classifiers/package_hallucination.py
@@ -21,7 +21,7 @@ def __post_init__(self) -> None:
         self.libraries = stdlib_list(self.python_version)
 
     def score(
-        self, input: str, _references: List[str] | None = None, _metadata: Dict[str, Any] | None = None
+        self, input: str, references: List[str] | None = None, metadata: Dict[str, Any] | None = None
     ) -> Score[List[str]]:
         """
         Scores the input based on the presence of hallucinated Python package names.

diff --git a/aisploit/classifiers/presidio/presidio_analyser.py b/aisploit/classifiers/presidio/presidio_analyser.py
@@ -26,7 +26,7 @@ def __post_init__(self) -> None:
             self._analyzer.registry.add_recognizer(recognizer=recognizer)
 
     def score(
-        self, input: str, _references: List[str] | None = None, _metadata: Dict[str, Any] | None = None
+        self, input: str, references: List[str] | None = None, metadata: Dict[str, Any] | None = None
     ) -> Score[List[RecognizerResult]]:
         """Score the input text for Personally Identifiable Information (PII) entities.
 

diff --git a/aisploit/classifiers/repeated_token.py b/aisploit/classifiers/repeated_token.py
@@ -8,7 +8,7 @@
 @dataclass
 class RepeatedTokenClassifier(BaseTextClassifier[str]):
     def score(
-        self, input: str, _references: List[str] | None = None, metadata: Dict[str, Any] | None = None
+        self, input: str, references: List[str] | None = None, metadata: Dict[str, Any] | None = None
     ) -> Score[str]:
         if not metadata:
             raise ValueError("metadata  is missing")

diff --git a/aisploit/classifiers/self_similarity.py b/aisploit/classifiers/self_similarity.py
@@ -18,7 +18,7 @@ class SelfSimilarityClassifier(BaseTextClassifier[Dict[str, Any]]):
     tags: List[str] = field(default_factory=lambda: ["hallucination"], init=False)
 
     def score(
-        self, input: str, references: List[str] | None = None, _metadata: Dict[str, Any] | None = None
+        self, input: str, references: List[str] | None = None, metadata: Dict[str, Any] | None = None
     ) -> Score[Dict[str, Any]]:
         """Score the input text based on its self-similarity to reference texts.
 

diff --git a/aisploit/classifiers/text.py b/aisploit/classifiers/text.py
@@ -19,7 +19,7 @@ def __init__(self, *, pattern: re.Pattern, flag_matches=True) -> None:
         self._flag_matches = flag_matches
 
     def score(
-        self, input: str, _references: List[str] | None = None, _metadata: Dict[str, Any] | None = None
+        self, input: str, references: List[str] | None = None, metadata: Dict[str, Any] | None = None
     ) -> Score[bool]:
         """Score the input based on the regular expression pattern.
 
@@ -65,7 +65,7 @@ class TextTokenClassifier(BaseTextClassifier[bool]):
     token: str
 
     def score(
-        self, input: str, _references: List[str] | None = None, _metadata: Dict[str, Any] | None = None
+        self, input: str, references: List[str] | None = None, metadata: Dict[str, Any] | None = None
     ) -> Score[bool]:
         return Score[bool](
             flagged=self.token in input,

diff --git a/aisploit/core/__init__.py b/aisploit/core/__init__.py
@@ -1,7 +1,7 @@
 from .callbacks import BaseCallbackHandler, CallbackManager, Callbacks
 from .classifier import BaseClassifier, BaseTextClassifier, Score
 from .converter import BaseChatModelConverter, BaseConverter
-from .dataset import BaseDataset, YamlDeserializable
+from .dataset import BaseDataset, DataclassDataset, TabularDataset, YamlDeserializable
 from .generator import BaseGenerator
 from .job import BaseJob
 from .model import BaseChatModel, BaseEmbeddings, BaseLLM, BaseModel
@@ -20,6 +20,8 @@
     "BaseConverter",
     "BaseChatModelConverter",
     "BaseDataset",
+    "DataclassDataset",
+    "TabularDataset",
     "YamlDeserializable",
     "BaseGenerator",
     "BaseJob",

diff --git a/aisploit/core/dataset.py b/aisploit/core/dataset.py
@@ -1,14 +1,28 @@
+from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import Generic, Sequence, Type, TypeVar
 
 import yaml
+from pandas import DataFrame
 
 T = TypeVar("T")
 
 
-class BaseDataset(Generic[T]):
+class BaseDataset(ABC):
     """Generic dataset class."""
 
+    @abstractmethod
+    def __iter__(self):
+        pass
+
+    @abstractmethod
+    def __len__(self):
+        pass
+
+
+class DataclassDataset(BaseDataset, Generic[T]):
+    """Dataset class based on dataclasses."""
+
     _entries: Sequence[T]
 
     def __iter__(self):
@@ -18,6 +32,19 @@ def __len__(self):
         return len(self._entries)
 
 
+class TabularDataset(BaseDataset):
+    """Dataset class for tabular data."""
+
+    _df: DataFrame
+
+    def __iter__(self):
+        for row in self._df.values.tolist():
+            yield row
+
+    def __len__(self):
+        return len(self._df)
+
+
 class YamlDeserializable:
     """Base class for objects that can be deserialized from YAML."""
 

diff --git a/aisploit/datasets/prompt.py b/aisploit/datasets/prompt.py
@@ -3,7 +3,7 @@
 from pathlib import Path
 from typing import Optional, Sequence
 
-from ..core.dataset import BaseDataset, YamlDeserializable
+from ..core.dataset import DataclassDataset, YamlDeserializable
 
 
 @dataclass
@@ -19,7 +19,7 @@ class Prompt(YamlDeserializable):
     template: str
 
 
-class PromptDataset(BaseDataset[Prompt]):
+class PromptDataset(DataclassDataset[Prompt]):
     """Dataset for prompts."""
 
     def __init__(self, prompts: Sequence[Prompt]) -> None:

diff --git a/aisploit/datasets/sample.py b/aisploit/datasets/sample.py
@@ -3,7 +3,7 @@
 from pathlib import Path
 from typing import Optional, Sequence
 
-from ..core.dataset import BaseDataset, YamlDeserializable
+from ..core.dataset import DataclassDataset, YamlDeserializable
 
 
 @dataclass
@@ -17,7 +17,7 @@ class Sample(YamlDeserializable):
     tags: Sequence[str]
 
 
-class SampleDataset(BaseDataset[Sample]):
+class SampleDataset(DataclassDataset[Sample]):
     """Dataset for samples."""
 
     def __init__(self, samples: Sequence[Sample]) -> None:

diff --git a/aisploit/generators/auto_jailbreak.py b/aisploit/generators/auto_jailbreak.py
@@ -6,7 +6,7 @@
 from langchain_core.prompts.prompt import PromptTemplate
 from langchain_core.pydantic_v1 import BaseModel, Field
 
-from ..core import BaseChatModel, BaseDataset, BaseGenerator
+from ..core import BaseChatModel, BaseGenerator, DataclassDataset
 
 DEFAULT_PATTERNS = [
     "Responsibility: Invent a context that appeals to the target LLM's responsibility to act.",
@@ -46,7 +46,7 @@ class AutoJailbreak(BaseModel):
     value: str = Field(description="the new prompt")
 
 
-class AutoJailbreakDataset(BaseDataset[AutoJailbreak]):
+class AutoJailbreakDataset(DataclassDataset[AutoJailbreak]):
     def __init__(self, prompts: Sequence[AutoJailbreak]) -> None:
         self._entries = prompts
 

diff --git a/aisploit/generators/poison.py b/aisploit/generators/poison.py
@@ -5,12 +5,24 @@
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts.prompt import PromptTemplate
 
-from ..core import BaseChatModel, BaseDataset, BaseEmbeddings, BaseGenerator
+from ..core import BaseChatModel, BaseEmbeddings, BaseGenerator, DataclassDataset
 from ..utils import cosine_distance
 
 
 @dataclass
 class Poison:
+    """
+    A class representing a poisoned input for testing language models.
+
+    Attributes:
+        question (str): The question to be asked.
+        question_embeddings (List[float]): The embeddings of the question.
+        target_answer (str): The desired target answer.
+        adversary_text (str): The adversarial text generated to elicit the target answer.
+        adversary_text_embeddings (List[float]): The embeddings of the adversarial text.
+        cosine_distance (float): The cosine distance between the question and adversarial text embeddings.
+    """
+
     question: str
     question_embeddings: List[float]
     target_answer: str
@@ -31,12 +43,20 @@ class Poison:
 )
 
 
-class PoisonDataset(BaseDataset[Poison]):
+class PoisonDataset(DataclassDataset[Poison]):
+    """
+    A dataset of poisoned inputs for testing language models.
+    """
+
     def __init__(self, poisons: Sequence[Poison]) -> None:
         self._entries = poisons
 
 
 class PoisonGenerator(BaseGenerator[Poison]):
+    """
+    A generator for creating poisoned inputs for testing language models.
+    """
+
     def __init__(
         self,
         *,
@@ -48,6 +68,18 @@ def __init__(
         max_words=30,
         max_iterations=10,
     ) -> None:
+        """
+        Initialize the PoisonGenerator.
+
+        Args:
+            question (str): The question to be asked.
+            answer (str): The desired target answer.
+            chat_model (BaseChatModel): The chat model to be used for generating adversarial text.
+            embeddings (BaseEmbeddings): The embeddings model to be used for calculating cosine distances.
+            prompt (PromptTemplate, optional): The prompt template to be used for generating adversarial text. Defaults to _template.
+            max_words (int, optional): The maximum number of words allowed in the adversarial text. Defaults to 30.
+            max_iterations (int, optional): The maximum number of iterations to try generating adversarial text. Defaults to 10.
+        """
         self._question = question
         self._answer = answer
         self._chain = prompt | chat_model | StrOutputParser()
@@ -56,6 +88,12 @@ def __init__(
         self._max_iterations = max_iterations
 
     def generate(self) -> Generator[Poison, Any, None]:
+        """
+        Generate poisoned inputs for testing language models.
+
+        Yields:
+            Poison: A poisoned input for testing language models.
+        """
         question_embeddings = self._embeddings.embed_query(self._question)
         for _ in range(self._max_iterations):
             adversary_text = self._chain.invoke(
@@ -78,4 +116,10 @@ def generate(self) -> Generator[Poison, Any, None]:
             )
 
     def generate_dataset(self) -> PoisonDataset:
+        """
+        Generate a dataset of poisoned inputs for testing language models.
+
+        Returns:
+            PoisonDataset: A dataset of poisoned inputs for testing language models.
+        """
         return PoisonDataset(list(self.generate()))
diff --git a/aisploit/scanner/templates/report.md b/aisploit/scanner/templates/report.md
@@ -13,7 +13,7 @@ No issues!
 | Prompt | Converter | Response | RTT (seconds) |
 |--------|-----------|----------|---------------|
 {% for issue in issues -%}
-|{{ issue.send_report_entry.prompt }}|{{ issue.send_report_entry.converter }}|{{ issue.send_report_entry.response }}| {{ issue.send_report_entry.round_trip_time }} |
+|{{ issue.send_report_entry.prompt_value }}|{{ issue.send_report_entry.converter }}|{{ issue.send_report_entry.response }}| {{ issue.send_report_entry.round_trip_time }} |
 {% endfor %}
 {% endfor %}
 {% endif %}