Skip to content

Commit

Permalink
Misc
Browse files Browse the repository at this point in the history
  • Loading branch information
hupe1980 committed Apr 30, 2024
1 parent 2a93b7e commit 9a59a6a
Show file tree
Hide file tree
Showing 19 changed files with 114 additions and 42 deletions.
4 changes: 2 additions & 2 deletions aisploit/classifiers/amazon/comprehend.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ class ComprehendPIIClassifier(BaseComprehendClassifier[List[Any]]):
tags: List[str] = field(default_factory=lambda: ["leakage"], init=False)

def score(
self, input: str, _references: List[str] | None = None, _metadata: Dict[str, Any] | None = None
self, input: str, references: List[str] | None = None, metadata: Dict[str, Any] | None = None
) -> Score[List[Any]]:
"""Score the input for PII using Amazon Comprehend.
Expand Down Expand Up @@ -67,7 +67,7 @@ class ComprehendToxicityClassifier(BaseComprehendClassifier[Dict[str, Any]]):
tags: List[str] = field(default_factory=lambda: ["toxicity"], init=False)

def score(
self, input: str, _references: List[str] | None = None, _metadata: Dict[str, Any] | None = None
self, input: str, references: List[str] | None = None, metadata: Dict[str, Any] | None = None
) -> Score[Dict[str, Any]]:
"""Score the input for toxicity using Amazon Comprehend.
Expand Down
2 changes: 1 addition & 1 deletion aisploit/classifiers/huggingface/bert_score.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class BertScoreClassifier(BaseTextClassifier[Dict[str, Any]]):
bertscore: evaluate.EvaluationModule = field(default_factory=lambda: evaluate.load("bertscore"), init=False)

def score(
self, input: str, references: List[str] | None = None, _metadata: Dict[str, Any] | None = None
self, input: str, references: List[str] | None = None, metadata: Dict[str, Any] | None = None
) -> Score[Dict[str, Any]]:
"""Score the input using BERTScore computed by the evaluate module.
Expand Down
2 changes: 1 addition & 1 deletion aisploit/classifiers/huggingface/bleu.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class BleuClassifier(BaseTextClassifier[Dict[str, Any]]):
bleu: evaluate.EvaluationModule = field(default_factory=lambda: evaluate.load("bleu"), init=False)

def score(
self, input: str, references: List[str] | None = None, _metadata: Dict[str, Any] | None = None
self, input: str, references: List[str] | None = None, metadata: Dict[str, Any] | None = None
) -> Score[Dict[str, Any]]:
"""Score the input using BLEU score computed by the evaluate module.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def __init__(
self._threshold = threshold

def score(
self, input: str, _references: List[str] | None = None, _metadata: Dict[str, Any] | None = None
self, input: str, references: List[str] | None = None, metadata: Dict[str, Any] | None = None
) -> Score[float]:
result = self._model(input)

Expand Down
2 changes: 1 addition & 1 deletion aisploit/classifiers/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class MarkdownInjectionClassifier(BaseTextClassifier[List[Any]]):
"""A text classifier to detect Markdown injection in input text."""

def score(
self, input: str, _references: List[str] | None = None, _metadata: Dict[str, Any] | None = None
self, input: str, references: List[str] | None = None, metadata: Dict[str, Any] | None = None
) -> Score[List[Any]]:
# !\[.*?\]\((.*?)\) - This is for the inline image format in Markdown, which is ![alt_text](url).
# !\[.*?\]\[(.*?)\] - This is for the reference-style image format in Markdown, which is ![alt_text][image_reference].
Expand Down
2 changes: 1 addition & 1 deletion aisploit/classifiers/openai/moderation.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def __init__(
self._client = OpenAI(api_key=api_key)

def score(
self, input: str, _references: List[str] | None = None, _metadata: Dict[str, Any] | None = None
self, input: str, references: List[str] | None = None, metadata: Dict[str, Any] | None = None
) -> Score[Moderation]:
"""Score the input using the OpenAI Moderations API.
Expand Down
2 changes: 1 addition & 1 deletion aisploit/classifiers/package_hallucination.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def __post_init__(self) -> None:
self.libraries = stdlib_list(self.python_version)

def score(
self, input: str, _references: List[str] | None = None, _metadata: Dict[str, Any] | None = None
self, input: str, references: List[str] | None = None, metadata: Dict[str, Any] | None = None
) -> Score[List[str]]:
"""
Scores the input based on the presence of hallucinated Python package names.
Expand Down
2 changes: 1 addition & 1 deletion aisploit/classifiers/presidio/presidio_analyser.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def __post_init__(self) -> None:
self._analyzer.registry.add_recognizer(recognizer=recognizer)

def score(
self, input: str, _references: List[str] | None = None, _metadata: Dict[str, Any] | None = None
self, input: str, references: List[str] | None = None, metadata: Dict[str, Any] | None = None
) -> Score[List[RecognizerResult]]:
"""Score the input text for Personally Identifiable Information (PII) entities.
Expand Down
2 changes: 1 addition & 1 deletion aisploit/classifiers/repeated_token.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
@dataclass
class RepeatedTokenClassifier(BaseTextClassifier[str]):
def score(
self, input: str, _references: List[str] | None = None, metadata: Dict[str, Any] | None = None
self, input: str, references: List[str] | None = None, metadata: Dict[str, Any] | None = None
) -> Score[str]:
if not metadata:
raise ValueError("metadata is missing")
Expand Down
2 changes: 1 addition & 1 deletion aisploit/classifiers/self_similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class SelfSimilarityClassifier(BaseTextClassifier[Dict[str, Any]]):
tags: List[str] = field(default_factory=lambda: ["hallucination"], init=False)

def score(
self, input: str, references: List[str] | None = None, _metadata: Dict[str, Any] | None = None
self, input: str, references: List[str] | None = None, metadata: Dict[str, Any] | None = None
) -> Score[Dict[str, Any]]:
"""Score the input text based on its self-similarity to reference texts.
Expand Down
4 changes: 2 additions & 2 deletions aisploit/classifiers/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def __init__(self, *, pattern: re.Pattern, flag_matches=True) -> None:
self._flag_matches = flag_matches

def score(
self, input: str, _references: List[str] | None = None, _metadata: Dict[str, Any] | None = None
self, input: str, references: List[str] | None = None, metadata: Dict[str, Any] | None = None
) -> Score[bool]:
"""Score the input based on the regular expression pattern.
Expand Down Expand Up @@ -65,7 +65,7 @@ class TextTokenClassifier(BaseTextClassifier[bool]):
token: str

def score(
self, input: str, _references: List[str] | None = None, _metadata: Dict[str, Any] | None = None
self, input: str, references: List[str] | None = None, metadata: Dict[str, Any] | None = None
) -> Score[bool]:
return Score[bool](
flagged=self.token in input,
Expand Down
4 changes: 3 additions & 1 deletion aisploit/core/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from .callbacks import BaseCallbackHandler, CallbackManager, Callbacks
from .classifier import BaseClassifier, BaseTextClassifier, Score
from .converter import BaseChatModelConverter, BaseConverter
from .dataset import BaseDataset, YamlDeserializable
from .dataset import BaseDataset, DataclassDataset, TabularDataset, YamlDeserializable
from .generator import BaseGenerator
from .job import BaseJob
from .model import BaseChatModel, BaseEmbeddings, BaseLLM, BaseModel
Expand All @@ -20,6 +20,8 @@
"BaseConverter",
"BaseChatModelConverter",
"BaseDataset",
"DataclassDataset",
"TabularDataset",
"YamlDeserializable",
"BaseGenerator",
"BaseJob",
Expand Down
29 changes: 28 additions & 1 deletion aisploit/core/dataset.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,28 @@
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Generic, Sequence, Type, TypeVar

import yaml
from pandas import DataFrame

T = TypeVar("T")


class BaseDataset(Generic[T]):
class BaseDataset(ABC):
    """Abstract base for datasets.

    Concrete datasets must implement both ``__iter__`` and ``__len__``;
    the class cannot be instantiated until they do.
    """

    @abstractmethod
    def __iter__(self):
        """Iterate over the entries of the dataset."""

    @abstractmethod
    def __len__(self):
        """Return the number of entries in the dataset."""


class DataclassDataset(BaseDataset, Generic[T]):
"""Dataset class based on dataclasses."""

_entries: Sequence[T]

def __iter__(self):
Expand All @@ -18,6 +32,19 @@ def __len__(self):
return len(self._entries)


class TabularDataset(BaseDataset):
    """Dataset class for tabular data.

    Rows are read from the ``_df`` DataFrame. This class only declares the
    attribute and never assigns it, so it presumably has to be set by a
    subclass before iterating -- TODO confirm against the subclasses.
    """

    # Backing table; annotated here but not assigned anywhere in this class.
    _df: DataFrame

    def __iter__(self):
        """Yield every row of ``_df`` as a list of its cell values."""
        # Delegate to the row list directly instead of re-yielding in a loop.
        yield from self._df.values.tolist()

    def __len__(self):
        """Return the number of rows in ``_df``."""
        return len(self._df)


class YamlDeserializable:
"""Base class for objects that can be deserialized from YAML."""

Expand Down
4 changes: 2 additions & 2 deletions aisploit/datasets/prompt.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from pathlib import Path
from typing import Optional, Sequence

from ..core.dataset import BaseDataset, YamlDeserializable
from ..core.dataset import DataclassDataset, YamlDeserializable


@dataclass
Expand All @@ -19,7 +19,7 @@ class Prompt(YamlDeserializable):
template: str


class PromptDataset(BaseDataset[Prompt]):
class PromptDataset(DataclassDataset[Prompt]):
"""Dataset for prompts."""

def __init__(self, prompts: Sequence[Prompt]) -> None:
Expand Down
4 changes: 2 additions & 2 deletions aisploit/datasets/sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from pathlib import Path
from typing import Optional, Sequence

from ..core.dataset import BaseDataset, YamlDeserializable
from ..core.dataset import DataclassDataset, YamlDeserializable


@dataclass
Expand All @@ -17,7 +17,7 @@ class Sample(YamlDeserializable):
tags: Sequence[str]


class SampleDataset(BaseDataset[Sample]):
class SampleDataset(DataclassDataset[Sample]):
"""Dataset for samples."""

def __init__(self, samples: Sequence[Sample]) -> None:
Expand Down
4 changes: 2 additions & 2 deletions aisploit/generators/auto_jailbreak.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

from ..core import BaseChatModel, BaseDataset, BaseGenerator
from ..core import BaseChatModel, BaseGenerator, DataclassDataset

DEFAULT_PATTERNS = [
"Responsibility: Invent a context that appeals to the target LLM's responsibility to act.",
Expand Down Expand Up @@ -46,7 +46,7 @@ class AutoJailbreak(BaseModel):
value: str = Field(description="the new prompt")


class AutoJailbreakDataset(BaseDataset[AutoJailbreak]):
class AutoJailbreakDataset(DataclassDataset[AutoJailbreak]):
def __init__(self, prompts: Sequence[AutoJailbreak]) -> None:
self._entries = prompts

Expand Down
48 changes: 46 additions & 2 deletions aisploit/generators/poison.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,24 @@
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts.prompt import PromptTemplate

from ..core import BaseChatModel, BaseDataset, BaseEmbeddings, BaseGenerator
from ..core import BaseChatModel, BaseEmbeddings, BaseGenerator, DataclassDataset
from ..utils import cosine_distance


@dataclass
class Poison:
"""
A class representing a poisoned input for testing language models.
Attributes:
question (str): The question to be asked.
question_embeddings (List[float]): The embeddings of the question.
target_answer (str): The desired target answer.
adversary_text (str): The adversarial text generated to elicit the target answer.
adversary_text_embeddings (List[float]): The embeddings of the adversarial text.
cosine_distance (float): The cosine distance between the question and adversarial text embeddings.
"""

question: str
question_embeddings: List[float]
target_answer: str
Expand All @@ -31,12 +43,20 @@ class Poison:
)


class PoisonDataset(BaseDataset[Poison]):
class PoisonDataset(DataclassDataset[Poison]):
    """A dataset of poisoned inputs for testing language models."""

    def __init__(self, poisons: Sequence[Poison]) -> None:
        """Initialize the dataset.

        Args:
            poisons (Sequence[Poison]): The poisoned inputs to hold. The
                sequence is stored as given, not copied.
        """
        self._entries = poisons


class PoisonGenerator(BaseGenerator[Poison]):
"""
A generator for creating poisoned inputs for testing language models.
"""

def __init__(
self,
*,
Expand All @@ -48,6 +68,18 @@ def __init__(
max_words=30,
max_iterations=10,
) -> None:
"""
Initialize the PoisonGenerator.
Args:
question (str): The question to be asked.
answer (str): The desired target answer.
chat_model (BaseChatModel): The chat model to be used for generating adversarial text.
embeddings (BaseEmbeddings): The embeddings model to be used for calculating cosine distances.
prompt (PromptTemplate, optional): The prompt template to be used for generating adversarial text. Defaults to _template.
max_words (int, optional): The maximum number of words allowed in the adversarial text. Defaults to 30.
max_iterations (int, optional): The maximum number of iterations to try generating adversarial text. Defaults to 10.
"""
self._question = question
self._answer = answer
self._chain = prompt | chat_model | StrOutputParser()
Expand All @@ -56,6 +88,12 @@ def __init__(
self._max_iterations = max_iterations

def generate(self) -> Generator[Poison, Any, None]:
"""
Generate poisoned inputs for testing language models.
Yields:
Poison: A poisoned input for testing language models.
"""
question_embeddings = self._embeddings.embed_query(self._question)
for _ in range(self._max_iterations):
adversary_text = self._chain.invoke(
Expand All @@ -78,4 +116,10 @@ def generate(self) -> Generator[Poison, Any, None]:
)

def generate_dataset(self) -> PoisonDataset:
    """
    Collect everything produced by ``generate()`` into a dataset.

    Returns:
        PoisonDataset: A dataset of poisoned inputs for testing language models.
    """
    # Drain the generator first so the dataset receives a concrete list.
    poisons = list(self.generate())
    return PoisonDataset(poisons)
2 changes: 1 addition & 1 deletion aisploit/scanner/templates/report.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ No issues!
| Prompt | Converter | Response | RTT (seconds) |
|--------|-----------|----------|---------------|
{% for issue in issues -%}
|{{ issue.send_report_entry.prompt }}|{{ issue.send_report_entry.converter }}|{{ issue.send_report_entry.response }}| {{ issue.send_report_entry.round_trip_time }} |
|{{ issue.send_report_entry.prompt_value }}|{{ issue.send_report_entry.converter }}|{{ issue.send_report_entry.response }}| {{ issue.send_report_entry.round_trip_time }} |
{% endfor %}
{% endfor %}
{% endif %}
Loading

0 comments on commit 9a59a6a

Please sign in to comment.