[DRAFT] [BREAKING] FEAT Refactor Harm Category as StrEnum #998


Open
eugeniavkim wants to merge 12 commits into Azure:main
from eugeniavkim:users/eugeniakim/refactor-harm-categories
Changes from all commits
5 changes: 1 addition & 4 deletions doc/code/orchestrators/qa_benchmark_orchestrator.ipynb
@@ -22,10 +22,7 @@
"\n",
"from pyrit.common import IN_MEMORY, initialize_pyrit\n",
"from pyrit.datasets import fetch_wmdp_dataset\n",
"from pyrit.models import (\n",
" QuestionAnsweringEntry,\n",
" QuestionChoice,\n",
")\n",
"from pyrit.models import QuestionAnsweringEntry, QuestionChoice\n",
"from pyrit.orchestrator import QuestionAnsweringBenchmarkOrchestrator\n",
"from pyrit.prompt_target import OpenAIChatTarget\n",
"from pyrit.score.self_ask_question_answer_scorer import SelfAskQuestionAnswerScorer\n",
7 changes: 4 additions & 3 deletions pyrit/datasets/adv_bench_dataset.py
@@ -6,8 +6,7 @@

from pyrit.common.path import DATASETS_PATH
from pyrit.datasets.dataset_helper import fetch_examples
from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_adv_bench_dataset(
@@ -67,6 +66,8 @@ def fetch_adv_bench_dataset(
source=str(Path(DATASETS_PATH) / "data" / "adv_bench_dataset.json"), source_type="file", cache=cache
)

HarmCategory._initialize_aliases()

filtered = dataset["data"] # type: ignore

if main_categories or sub_categories:
@@ -87,7 +88,7 @@ def fetch_adv_bench_dataset(
data_type="text",
name="AdvBench Dataset [Extended]",
dataset_name="AdvBench Dataset",
harm_categories=item["main_categories"] + item["sub_categories"],
harm_categories=[HarmCategory.parse(cat) for cat in item["main_categories"] + item["sub_categories"]],
description="""AdvBench is a set of 520 harmful behaviors formulated as instructions. This dataset
has been extended to include harm categories for better filtering and analysis. The adversary's goal
is instead to find a single attack string that will cause the model to generate any response that
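For readers following the diff, here is a minimal sketch of the API surface these dataset fetchers now rely on. It is an assumption inferred from the calls in this PR (`HarmCategory.parse()`, `HarmCategory._initialize_aliases()`, `HarmCategory.OTHER`), not the actual contents of `pyrit/models/harm_category.py`; the members shown are a subset of those used in this PR and the alias table is hypothetical.

```python
# Hedged sketch only: a StrEnum-style HarmCategory inferred from how this PR uses it.
# The real pyrit/models/harm_category.py may differ; the alias table here is hypothetical.
from enum import Enum

_ALIAS_MAP: dict = {}  # filled by _initialize_aliases(); maps normalized labels to members


class HarmCategory(str, Enum):  # str mixin so members behave like their string values
    HATESPEECH = "hate_speech"
    VIOLENT_CONTENT = "violent_content"
    SEXUAL_CONTENT = "sexual_content"
    SELF_HARM = "self_harm"
    ILLEGAL = "illegal"
    CBRN = "cbrn"
    OTHER = "other"

    @classmethod
    def _initialize_aliases(cls) -> None:
        # Hypothetical alias table: free-form dataset labels -> canonical members.
        _ALIAS_MAP.update(
            {
                "hate": cls.HATESPEECH,
                "violence": cls.VIOLENT_CONTENT,
                "self_harm": cls.SELF_HARM,
                "chemical_biological": cls.CBRN,
            }
        )

    @classmethod
    def parse(cls, value: str) -> "HarmCategory":
        """Map a free-form label to a member, falling back to OTHER."""
        normalized = str(value).strip().lower().replace("-", "_").replace(" ", "_")
        for member in cls:
            if normalized in (member.value, member.name.lower()):
                return member
        return _ALIAS_MAP.get(normalized, cls.OTHER)
```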
14 changes: 9 additions & 5 deletions pyrit/datasets/aya_redteaming_dataset.py
@@ -6,8 +6,7 @@
from typing import List, Literal, Optional

from pyrit.datasets.dataset_helper import fetch_examples
from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_aya_redteaming_dataset(
@@ -75,19 +74,24 @@
data_home=data_home,
)

HarmCategory._initialize_aliases()

parsed_filter_categories = [HarmCategory.parse(c) for c in harm_categories] if harm_categories else None

seed_prompts = []

for example in examples:
categories = ast.literal_eval(example["harm_category"])
if harm_categories is None or any(cat in categories for cat in harm_categories):
raw_categories = ast.literal_eval(example["harm_category"])
parsed_categories = [HarmCategory.parse(c) for c in raw_categories]
if parsed_filter_categories is None or any(cat in parsed_categories for cat in parsed_filter_categories):
if harm_scope is None or example["global_or_local"] == harm_scope:
seed_prompts.append(
SeedPrompt(
value=example["prompt"],
data_type="text",
name="Aya Red-teaming Examples",
dataset_name="Aya Red-teaming Examples",
harm_categories=categories,
harm_categories=parsed_categories,
source="https://huggingface.co/datasets/CohereForAI/aya_redteaming",
)
)
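A hedged usage sketch of the filtering change above (the specific labels and alias resolutions are assumptions): because both the caller's `harm_categories` filter and the dataset's per-row labels now go through `HarmCategory.parse()`, spelling variants that alias to the same member compare equal, whereas the old raw-string comparison required exact matches.

```python
# Assumes an alias table in which "Hate Speech" and "hate" both resolve to the same member.
from pyrit.models import HarmCategory

HarmCategory._initialize_aliases()

filter_categories = [HarmCategory.parse("Hate Speech")]  # caller-supplied filter
row_categories = [HarmCategory.parse("hate"), HarmCategory.parse("Violence")]  # dataset labels

matches = any(cat in row_categories for cat in filter_categories)
# True if both labels resolve to HarmCategory.HATESPEECH; previously the raw strings
# "Hate Speech" and "hate" would not have matched.
```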
7 changes: 4 additions & 3 deletions pyrit/datasets/ccp_sensitive_prompts_dataset.py
@@ -3,8 +3,7 @@

from datasets import load_dataset

from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_ccp_sensitive_prompts_dataset() -> SeedPromptDataset:
@@ -25,14 +24,16 @@ def fetch_ccp_sensitive_prompts_dataset() -> SeedPromptDataset:
split="train",
)

HarmCategory._initialize_aliases()

return SeedPromptDataset(
prompts=[
SeedPrompt(
value=row["prompt"],
data_type="text",
name="",
dataset_name="CCP-sensitive-prompts",
harm_categories=[row["subject"]],
harm_categories=[HarmCategory.parse(row["subject"])],
description=("Prompts covering topics sensitive to the CCP."),
groups=["promptfoo"],
source="https://huggingface.co/datasets/promptfoo/CCP-sensitive-prompts",
7 changes: 4 additions & 3 deletions pyrit/datasets/darkbench_dataset.py
@@ -3,8 +3,7 @@

from datasets import load_dataset

from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_darkbench_dataset() -> SeedPromptDataset:
@@ -22,13 +21,15 @@ def fetch_darkbench_dataset() -> SeedPromptDataset:
"""
data = load_dataset("apart/darkbench", "default", split="train", data_files="darkbench.tsv")

HarmCategory._initialize_aliases()

seed_prompts = [
SeedPrompt(
value=item["Example"],
data_type="text",
name="",
dataset_name="DarkBench",
harm_categories=[item["Deceptive Pattern"]],
harm_categories=[HarmCategory.parse(item["Deceptive Pattern"])],
description=(
"DarkBench is a comprehensive benchmark designed to detect dark design patterns in large language"
" models (LLMs). Dark patterns are manipulative techniques that influence user behavior, often against"
7 changes: 4 additions & 3 deletions pyrit/datasets/decoding_trust_stereotypes_dataset.py
@@ -6,8 +6,7 @@

from pyrit.common.path import DATASETS_PATH
from pyrit.datasets.dataset_helper import FILE_TYPE_HANDLERS, fetch_examples
from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_decoding_trust_stereotypes_dataset(
@@ -51,6 +50,8 @@ def fetch_decoding_trust_stereotypes_dataset(
Ritik Dutta, Rylan Schaeffer, Sang T. Truong, Simran Arora, Mantas Mazeika, Dan Hendrycks, Zinan Lin,
Yu Cheng, Sanmi Koyejo, Dawn Song, Bo Li. is available at https://arxiv.org/abs//2306.11698
"""
HarmCategory._initialize_aliases()

if system_prompt_type not in ["benign", "untargeted", "targeted"]:
raise ValueError("Invalid system_prompt_type. Expected 'benign', 'untargeted', or 'targeted'}")
# Determine the file type from the source URL
@@ -105,7 +106,7 @@ def fetch_decoding_trust_stereotypes_dataset(
data_type="text",
name="DecodingTrust Stereotypes Examples",
dataset_name="DecodingTrust Stereotypes Examples",
harm_categories=["stereotypes"],
harm_categories=[HarmCategory.parse("stereotypes")],
description="""A dataset of Decoding Trust 'Stereotypes' example prompts across many different topics
and target groups. These prompts attempt to convince the target to agree with a harmful
stereotype.""",
7 changes: 4 additions & 3 deletions pyrit/datasets/equitymedqa_dataset.py
@@ -5,8 +5,7 @@

from datasets import load_dataset

from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset

# These are the available subsets of the EquityMedQA dataset
DATA_SUBSETS: list[str] = [
@@ -66,6 +65,8 @@ def fetch_equitymedqa_dataset_unique_values(
"""
prompts: list[str] = []

HarmCategory._initialize_aliases()

if subset_name == "all": # get all subsets
targets: list[str] = DATA_SUBSETS

@@ -89,7 +90,7 @@ def fetch_equitymedqa_dataset_unique_values(
name="katielink/EquityMedQA",
dataset_name="katielink/EquityMedQA",
description="This dataset contains prompts used to assess medical biases in AI systems",
harm_categories=["health_bias"],
harm_categories=[HarmCategory.parse("health_bias")],
source="https://huggingface.co/datasets/katielink/EquityMedQA",
)
for prompt in prompts
7 changes: 4 additions & 3 deletions pyrit/datasets/forbidden_questions_dataset.py
@@ -3,8 +3,7 @@

from datasets import load_dataset

from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_forbidden_questions_dataset() -> SeedPromptDataset:
@@ -20,6 +19,8 @@ def fetch_forbidden_questions_dataset() -> SeedPromptDataset:
"""
data = load_dataset("TrustAIRLab/forbidden_question_set", "default")

HarmCategory._initialize_aliases()

authors = ["Xinyue Shen", "Zeyuan Chen", "Michael Backes", "Yun Shen", "Yang Zhang"]
seed_prompts = [
SeedPrompt(
@@ -28,7 +29,7 @@ def fetch_forbidden_questions_dataset() -> SeedPromptDataset:
name="TrustAIRLab/forbidden_question_set",
dataset_name="TrustAIRLab/forbidden_question_set",
authors=authors,
harm_categories=item["content_policy_name"],
harm_categories=[HarmCategory.parse(item["content_policy_name"])],
source="https://huggingface.co/datasets/TrustAIRLab/forbidden_question_set",
description="""This is the Forbidden Question Set dataset proposed in the ACM CCS 2024 paper
"Do Anything Now'': Characterizing and Evaluating In-The-Wild Jailbreak Prompts on Large Language Models.
11 changes: 10 additions & 1 deletion pyrit/datasets/harmbench_dataset.py
@@ -6,6 +6,7 @@

from pyrit.datasets.dataset_helper import FILE_TYPE_HANDLERS, fetch_examples
from pyrit.models import SeedPromptDataset
from pyrit.models.harm_category import HarmCategory
from pyrit.models.seed_prompt import SeedPrompt


@@ -41,6 +42,9 @@ def fetch_harmbench_dataset(
valid_types = ", ".join(FILE_TYPE_HANDLERS.keys())
raise ValueError(f"Invalid file_type. Expected one of: {valid_types}.")

# Initialize aliases for associated harm categories
HarmCategory._initialize_aliases()

# Required keys to validate each example
required_keys = {"Behavior", "SemanticCategory"}

@@ -62,13 +66,18 @@ def fetch_harmbench_dataset(
prompts.append(example["Behavior"])
semantic_categories.add(example["SemanticCategory"])

# Parse the collected semantic categories into HarmCategory enums
parsed_semantic_categories = [
HarmCategory.parse(cat) if isinstance(cat, str) else HarmCategory.OTHER for cat in semantic_categories
]

seed_prompts = [
SeedPrompt(
value=example,
data_type="text",
name="HarmBench Examples",
dataset_name="HarmBench Examples",
harm_categories=list(semantic_categories),
harm_categories=parsed_semantic_categories, # type: ignore
description="A dataset of HarmBench examples containing various categories such as chemical,"
"biological, illegal activities, etc.",
)
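One behavioral detail of the hunk above, sketched under assumed labels: deduplication happens on the raw `SemanticCategory` strings (via the set), so two distinct raw labels that parse to the same member can still produce duplicate enum entries in the resulting list.

```python
# Hypothetical labels; actual HarmBench SemanticCategory values may differ.
from pyrit.models import HarmCategory

HarmCategory._initialize_aliases()

semantic_categories = {"chemical_weapons", "biological_weapons"}  # deduped raw strings
parsed_semantic_categories = [
    HarmCategory.parse(cat) if isinstance(cat, str) else HarmCategory.OTHER
    for cat in semantic_categories
]
# If both labels alias to HarmCategory.CBRN, the list contains CBRN twice, because the
# set deduplicates raw strings before parsing, not the parsed members.
```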
11 changes: 7 additions & 4 deletions pyrit/datasets/librAI_do_not_answer_dataset.py
@@ -3,8 +3,7 @@

from datasets import load_dataset

from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_librAI_do_not_answer_dataset() -> SeedPromptDataset:
@@ -27,14 +26,18 @@ def fetch_librAI_do_not_answer_dataset() -> SeedPromptDataset:
"""
# Load dataset from Hugging Face
data = load_dataset("LibrAI/do-not-answer", split="train")

HarmCategory._initialize_aliases()
seed_prompts = [
SeedPrompt(
value=entry["question"],
data_type="text",
name="",
dataset_name="LibrAI/Do-Not-Answer",
harm_categories=[entry["risk_area"], entry["types_of_harm"], entry["specific_harms"]],
harm_categories=[
HarmCategory.parse(entry.get("risk_area", "")) or HarmCategory.OTHER,
HarmCategory.parse(entry.get("types_of_harm", "")) or HarmCategory.OTHER,
HarmCategory.parse(entry.get("specific_harms", "")) or HarmCategory.OTHER,
],
description=(
f"This is a prompt from the 'Do Not Answer' dataset under the risk area: {entry['risk_area']}, "
f"harm type: {entry['types_of_harm']}, and specific harm: {entry['specific_harms']}."
33 changes: 17 additions & 16 deletions pyrit/datasets/mlcommons_ailuminate_demo_dataset.py
@@ -6,25 +6,26 @@

from pyrit.datasets.dataset_helper import fetch_examples
from pyrit.models import SeedPromptDataset
from pyrit.models.harm_category import HarmCategory
from pyrit.models.seed_prompt import SeedPrompt

# https://github.com/mlcommons/ailuminate#hazard-categories-and-abbreviations
_hazard_categories_map = {
"vcr": "Violent Crimes",
"src": "Sex-Related Crimes",
"cse": "Child Sexual Exploitation",
"ssh": "Suicide & Self-Harm",
"iwp": "Indiscriminate Weapons (CBRNE)",
"ipv": "Intellectual Property Violations",
"dfm": "Defamation",
"ncr": "Non-Violent Crimes",
"hte": "Hate",
"prv": "Privacy Violations",
"spc_ele": "Specialized Advice; elections",
"spc_fin": "Specialized Advice; financial",
"spc_hlt": "Specialized Advice; health",
"sxc_prn": "Sexual Content; pornographic",
"spc_lgl": "Specialized Advice; legal",
"vcr": HarmCategory.VIOLENT_CONTENT,
"src": HarmCategory.SEXUAL_CONTENT,
"cse": HarmCategory.SEXUAL_CONTENT,
"ssh": HarmCategory.SELF_HARM,
"iwp": HarmCategory.CBRN,
"ipv": HarmCategory.PROPRIETARY_INFO,
"dfm": HarmCategory.REPUTATIONAL_DAMAGE,
"ncr": HarmCategory.ILLEGAL,
"hte": HarmCategory.HATESPEECH,
"prv": HarmCategory.PPI,
"spc_ele": HarmCategory.CAMPAIGNING,
"spc_fin": HarmCategory.FINANCIAL_ADVICE,
"spc_hlt": HarmCategory.PUBLIC_HEALTH,
"sxc_prn": HarmCategory.SEXUAL_CONTENT,
"spc_lgl": HarmCategory.LEGAL_ADVICE,
}


@@ -58,7 +59,7 @@ def fetch_mlcommons_ailuminate_demo_dataset(
name="",
dataset_name="AILuminate v1.0 DEMO Prompt Set",
# Saving both the full hazard category name and the abbreviation
harm_categories=[_hazard_categories_map[example["hazard"]], example["hazard"]],
harm_categories=[_hazard_categories_map.get(example.get("hazard", "").lower(), HarmCategory.OTHER)],
description=(
"This dataset contains the DEMO prompt library of the AILuminate 1.0 prompt dataset, created by"
" MLCommons AI Risk & Reliability working group. It contains 1,200 human-generated prompts that"
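A small sketch of the lookup behavior introduced above (the row contents are hypothetical): hazard abbreviations are lower-cased before the map lookup, and an unmapped or missing abbreviation falls back to `HarmCategory.OTHER` rather than raising a `KeyError` as the previous direct indexing would.

```python
from pyrit.models import HarmCategory

# Tiny stand-in for the module's _hazard_categories_map (subset of the mapping above).
hazard_map = {
    "vcr": HarmCategory.VIOLENT_CONTENT,
    "hte": HarmCategory.HATESPEECH,
}

row = {"hazard": "VCR"}  # hypothetical example row
category = hazard_map.get(row.get("hazard", "").lower(), HarmCategory.OTHER)
# "VCR".lower() -> "vcr" -> HarmCategory.VIOLENT_CONTENT; anything unmapped or missing
# resolves to HarmCategory.OTHER.
```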
5 changes: 2 additions & 3 deletions pyrit/datasets/multilingual_vulnerability_dataset.py
@@ -3,8 +3,7 @@

import pandas as pd

from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_multilingual_vulnerability_dataset() -> SeedPromptDataset:
@@ -24,7 +23,7 @@ def fetch_multilingual_vulnerability_dataset() -> SeedPromptDataset:
data_type="text",
name=str(row["id"]),
dataset_name="Multilingual-Vulnerability",
harm_categories=[row["type"]],
harm_categories=[HarmCategory.parse(row["type"])],
description="Dataset from 'A Framework to Assess Multilingual Vulnerabilities of LLMs'. "
"Multilingual prompts demonstrating LLM vulnerabilities, labeled by type. "
"Paper: https://arxiv.org/pdf/2503.13081",