[DRAFT] [BREAKING] FEAT Refactor Harm Category as StrEnum #998


Open
eugeniavkim wants to merge 12 commits into Azure:main
from eugeniavkim:users/eugeniakim/refactor-harm-categories
Changes from all commits
5 changes: 1 addition & 4 deletions doc/code/orchestrators/qa_benchmark_orchestrator.ipynb
@@ -22,10 +22,7 @@
"\n",
"from pyrit.common import IN_MEMORY, initialize_pyrit\n",
"from pyrit.datasets import fetch_wmdp_dataset\n",
"from pyrit.models import (\n",
" QuestionAnsweringEntry,\n",
" QuestionChoice,\n",
")\n",
"from pyrit.models import QuestionAnsweringEntry, QuestionChoice\n",
"from pyrit.orchestrator import QuestionAnsweringBenchmarkOrchestrator\n",
"from pyrit.prompt_target import OpenAIChatTarget\n",
"from pyrit.score.self_ask_question_answer_scorer import SelfAskQuestionAnswerScorer\n",
7 changes: 4 additions & 3 deletions pyrit/datasets/adv_bench_dataset.py
@@ -6,8 +6,7 @@

from pyrit.common.path import DATASETS_PATH
from pyrit.datasets.dataset_helper import fetch_examples
from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_adv_bench_dataset(
@@ -67,6 +66,8 @@ def fetch_adv_bench_dataset(
source=str(Path(DATASETS_PATH) / "data" / "adv_bench_dataset.json"), source_type="file", cache=cache
)

HarmCategory._initialize_aliases()

filtered = dataset["data"] # type: ignore

if main_categories or sub_categories:
@@ -87,7 +88,7 @@ def fetch_adv_bench_dataset(
data_type="text",
name="AdvBench Dataset [Extended]",
dataset_name="AdvBench Dataset",
harm_categories=item["main_categories"] + item["sub_categories"],
harm_categories=[HarmCategory.parse(cat) for cat in item["main_categories"] + item["sub_categories"]],
description="""AdvBench is a set of 520 harmful behaviors formulated as instructions. This dataset
has been extended to include harm categories for better filtering and analysis. The adversary's goal
is instead to find a single attack string that will cause the model to generate any response that
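For readers following the diff, here is a minimal sketch of the API surface these dataset fetchers now rely on. It is an assumption inferred from the calls in this PR (`HarmCategory.parse()`, `HarmCategory._initialize_aliases()`, `HarmCategory.OTHER`), not the actual contents of `pyrit/models/harm_category.py`; the members shown are a subset of those used in this PR and the alias table is hypothetical.

```python
# Hedged sketch only: a StrEnum-style HarmCategory inferred from how this PR uses it.
# The real pyrit/models/harm_category.py may differ; the alias table here is hypothetical.
from enum import Enum

_ALIAS_MAP: dict = {}  # filled by _initialize_aliases(); maps normalized labels to members


class HarmCategory(str, Enum):  # str mixin so members behave like their string values
    HATESPEECH = "hate_speech"
    VIOLENT_CONTENT = "violent_content"
    SEXUAL_CONTENT = "sexual_content"
    SELF_HARM = "self_harm"
    ILLEGAL = "illegal"
    CBRN = "cbrn"
    OTHER = "other"

    @classmethod
    def _initialize_aliases(cls) -> None:
        # Hypothetical alias table: free-form dataset labels -> canonical members.
        _ALIAS_MAP.update(
            {
                "hate": cls.HATESPEECH,
                "violence": cls.VIOLENT_CONTENT,
                "self_harm": cls.SELF_HARM,
                "chemical_biological": cls.CBRN,
            }
        )

    @classmethod
    def parse(cls, value: str) -> "HarmCategory":
        """Map a free-form label to a member, falling back to OTHER."""
        normalized = str(value).strip().lower().replace("-", "_").replace(" ", "_")
        for member in cls:
            if normalized in (member.value, member.name.lower()):
                return member
        return _ALIAS_MAP.get(normalized, cls.OTHER)
```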
14 changes: 9 additions & 5 deletions pyrit/datasets/aya_redteaming_dataset.py
@@ -6,8 +6,7 @@
from typing import List, Literal, Optional

from pyrit.datasets.dataset_helper import fetch_examples
from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_aya_redteaming_dataset(
@@ -75,19 +74,24 @@
data_home=data_home,
)

HarmCategory._initialize_aliases()

parsed_filter_categories = [HarmCategory.parse(c) for c in harm_categories] if harm_categories else None

seed_prompts = []

for example in examples:
categories = ast.literal_eval(example["harm_category"])
if harm_categories is None or any(cat in categories for cat in harm_categories):
raw_categories = ast.literal_eval(example["harm_category"])
parsed_categories = [HarmCategory.parse(c) for c in raw_categories]
if parsed_filter_categories is None or any(cat in parsed_categories for cat in parsed_filter_categories):
if harm_scope is None or example["global_or_local"] == harm_scope:
seed_prompts.append(
SeedPrompt(
value=example["prompt"],
data_type="text",
name="Aya Red-teaming Examples",
dataset_name="Aya Red-teaming Examples",
harm_categories=categories,
harm_categories=parsed_categories,
source="https://huggingface.co/datasets/CohereForAI/aya_redteaming",
)
)
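A hedged usage sketch of the filtering change above (the specific labels and alias resolutions are assumptions): because both the caller's `harm_categories` filter and the dataset's per-row labels now go through `HarmCategory.parse()`, spelling variants that alias to the same member compare equal, whereas the old raw-string comparison required exact matches.

```python
# Assumes an alias table in which "Hate Speech" and "hate" both resolve to the same member.
from pyrit.models import HarmCategory

HarmCategory._initialize_aliases()

filter_categories = [HarmCategory.parse("Hate Speech")]  # caller-supplied filter
row_categories = [HarmCategory.parse("hate"), HarmCategory.parse("Violence")]  # dataset labels

matches = any(cat in row_categories for cat in filter_categories)
# True if both labels resolve to HarmCategory.HATESPEECH; previously the raw strings
# "Hate Speech" and "hate" would not have matched.
```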
7 changes: 4 additions & 3 deletions pyrit/datasets/ccp_sensitive_prompts_dataset.py
@@ -3,8 +3,7 @@

from datasets import load_dataset

from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_ccp_sensitive_prompts_dataset() -> SeedPromptDataset:
@@ -25,14 +24,16 @@ def fetch_ccp_sensitive_prompts_dataset() -> SeedPromptDataset:
split="train",
)

HarmCategory._initialize_aliases()

return SeedPromptDataset(
prompts=[
SeedPrompt(
value=row["prompt"],
data_type="text",
name="",
dataset_name="CCP-sensitive-prompts",
harm_categories=[row["subject"]],
harm_categories=[HarmCategory.parse(row["subject"])],
description=("Prompts covering topics sensitive to the CCP."),
groups=["promptfoo"],
source="https://huggingface.co/datasets/promptfoo/CCP-sensitive-prompts",
7 changes: 4 additions & 3 deletions pyrit/datasets/darkbench_dataset.py
@@ -3,8 +3,7 @@

from datasets import load_dataset

from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_darkbench_dataset() -> SeedPromptDataset:
@@ -22,13 +21,15 @@ def fetch_darkbench_dataset() -> SeedPromptDataset:
"""
data = load_dataset("apart/darkbench", "default", split="train", data_files="darkbench.tsv")

HarmCategory._initialize_aliases()

seed_prompts = [
SeedPrompt(
value=item["Example"],
data_type="text",
name="",
dataset_name="DarkBench",
harm_categories=[item["Deceptive Pattern"]],
harm_categories=[HarmCategory.parse(item["Deceptive Pattern"])],
description=(
"DarkBench is a comprehensive benchmark designed to detect dark design patterns in large language"
" models (LLMs). Dark patterns are manipulative techniques that influence user behavior, often against"
7 changes: 4 additions & 3 deletions pyrit/datasets/decoding_trust_stereotypes_dataset.py
@@ -6,8 +6,7 @@

from pyrit.common.path import DATASETS_PATH
from pyrit.datasets.dataset_helper import FILE_TYPE_HANDLERS, fetch_examples
from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_decoding_trust_stereotypes_dataset(
@@ -51,6 +50,8 @@ def fetch_decoding_trust_stereotypes_dataset(
Ritik Dutta, Rylan Schaeffer, Sang T. Truong, Simran Arora, Mantas Mazeika, Dan Hendrycks, Zinan Lin,
Yu Cheng, Sanmi Koyejo, Dawn Song, Bo Li. is available at https://arxiv.org/abs//2306.11698
"""
HarmCategory._initialize_aliases()

if system_prompt_type not in ["benign", "untargeted", "targeted"]:
raise ValueError("Invalid system_prompt_type. Expected 'benign', 'untargeted', or 'targeted'}")
# Determine the file type from the source URL
@@ -105,7 +106,7 @@ def fetch_decoding_trust_stereotypes_dataset(
data_type="text",
name="DecodingTrust Stereotypes Examples",
dataset_name="DecodingTrust Stereotypes Examples",
harm_categories=["stereotypes"],
harm_categories=[HarmCategory.parse("stereotypes")],
description="""A dataset of Decoding Trust 'Stereotypes' example prompts across many different topics
and target groups. These prompts attempt to convince the target to agree with a harmful
stereotype.""",
7 changes: 4 additions & 3 deletions pyrit/datasets/equitymedqa_dataset.py
@@ -5,8 +5,7 @@

from datasets import load_dataset

from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset

# These are the available subsets of the EquityMedQA dataset
DATA_SUBSETS: list[str] = [
@@ -66,6 +65,8 @@ def fetch_equitymedqa_dataset_unique_values(
"""
prompts: list[str] = []

HarmCategory._initialize_aliases()

if subset_name == "all": # get all subsets
targets: list[str] = DATA_SUBSETS

@@ -89,7 +90,7 @@ def fetch_equitymedqa_dataset_unique_values(
name="katielink/EquityMedQA",
dataset_name="katielink/EquityMedQA",
description="This dataset contains prompts used to assess medical biases in AI systems",
harm_categories=["health_bias"],
harm_categories=[HarmCategory.parse("health_bias")],
source="https://huggingface.co/datasets/katielink/EquityMedQA",
)
for prompt in prompts
7 changes: 4 additions & 3 deletions pyrit/datasets/forbidden_questions_dataset.py
@@ -3,8 +3,7 @@

from datasets import load_dataset

from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_forbidden_questions_dataset() -> SeedPromptDataset:
@@ -20,6 +19,8 @@ def fetch_forbidden_questions_dataset() -> SeedPromptDataset:
"""
data = load_dataset("TrustAIRLab/forbidden_question_set", "default")

HarmCategory._initialize_aliases()

authors = ["Xinyue Shen", "Zeyuan Chen", "Michael Backes", "Yun Shen", "Yang Zhang"]
seed_prompts = [
SeedPrompt(
@@ -28,7 +29,7 @@ def fetch_forbidden_questions_dataset() -> SeedPromptDataset:
name="TrustAIRLab/forbidden_question_set",
dataset_name="TrustAIRLab/forbidden_question_set",
authors=authors,
harm_categories=item["content_policy_name"],
harm_categories=[HarmCategory.parse(item["content_policy_name"])],
source="https://huggingface.co/datasets/TrustAIRLab/forbidden_question_set",
description="""This is the Forbidden Question Set dataset proposed in the ACM CCS 2024 paper
"Do Anything Now'': Characterizing and Evaluating In-The-Wild Jailbreak Prompts on Large Language Models.
11 changes: 10 additions & 1 deletion pyrit/datasets/harmbench_dataset.py
@@ -6,6 +6,7 @@

from pyrit.datasets.dataset_helper import FILE_TYPE_HANDLERS, fetch_examples
from pyrit.models import SeedPromptDataset
from pyrit.models.harm_category import HarmCategory
from pyrit.models.seed_prompt import SeedPrompt


@@ -41,6 +42,9 @@ def fetch_harmbench_dataset(
valid_types = ", ".join(FILE_TYPE_HANDLERS.keys())
raise ValueError(f"Invalid file_type. Expected one of: {valid_types}.")

# Initialize aliases for associated harm categories
HarmCategory._initialize_aliases()

# Required keys to validate each example
required_keys = {"Behavior", "SemanticCategory"}

@@ -62,13 +66,18 @@ def fetch_harmbench_dataset(
prompts.append(example["Behavior"])
semantic_categories.add(example["SemanticCategory"])

# Parse the collected semantic categories into HarmCategory enums
parsed_semantic_categories = [
HarmCategory.parse(cat) if isinstance(cat, str) else HarmCategory.OTHER for cat in semantic_categories
]

seed_prompts = [
SeedPrompt(
value=example,
data_type="text",
name="HarmBench Examples",
dataset_name="HarmBench Examples",
harm_categories=list(semantic_categories),
harm_categories=parsed_semantic_categories, # type: ignore
description="A dataset of HarmBench examples containing various categories such as chemical,"
"biological, illegal activities, etc.",
)
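One behavioral detail of the hunk above, sketched under assumed labels: deduplication happens on the raw `SemanticCategory` strings (via the set), so two distinct raw labels that parse to the same member can still produce duplicate enum entries in the resulting list.

```python
# Hypothetical labels; actual HarmBench SemanticCategory values may differ.
from pyrit.models import HarmCategory

HarmCategory._initialize_aliases()

semantic_categories = {"chemical_weapons", "biological_weapons"}  # deduped raw strings
parsed_semantic_categories = [
    HarmCategory.parse(cat) if isinstance(cat, str) else HarmCategory.OTHER
    for cat in semantic_categories
]
# If both labels alias to HarmCategory.CBRN, the list contains CBRN twice, because the
# set deduplicates raw strings before parsing, not the parsed members.
```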
11 changes: 7 additions & 4 deletions pyrit/datasets/librAI_do_not_answer_dataset.py
@@ -3,8 +3,7 @@

from datasets import load_dataset

from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_librAI_do_not_answer_dataset() -> SeedPromptDataset:
@@ -27,14 +26,18 @@ def fetch_librAI_do_not_answer_dataset() -> SeedPromptDataset:
"""
# Load dataset from Hugging Face
data = load_dataset("LibrAI/do-not-answer", split="train")

HarmCategory._initialize_aliases()
seed_prompts = [
SeedPrompt(
value=entry["question"],
data_type="text",
name="",
dataset_name="LibrAI/Do-Not-Answer",
harm_categories=[entry["risk_area"], entry["types_of_harm"], entry["specific_harms"]],
harm_categories=[
HarmCategory.parse(entry.get("risk_area", "")) or HarmCategory.OTHER,
HarmCategory.parse(entry.get("types_of_harm", "")) or HarmCategory.OTHER,
HarmCategory.parse(entry.get("specific_harms", "")) or HarmCategory.OTHER,
],
description=(
f"This is a prompt from the 'Do Not Answer' dataset under the risk area: {entry['risk_area']}, "
f"harm type: {entry['types_of_harm']}, and specific harm: {entry['specific_harms']}."
33 changes: 17 additions & 16 deletions pyrit/datasets/mlcommons_ailuminate_demo_dataset.py
@@ -6,25 +6,26 @@

from pyrit.datasets.dataset_helper import fetch_examples
from pyrit.models import SeedPromptDataset
from pyrit.models.harm_category import HarmCategory
from pyrit.models.seed_prompt import SeedPrompt

# https://github.com/mlcommons/ailuminate#hazard-categories-and-abbreviations
_hazard_categories_map = {
"vcr": "Violent Crimes",
"src": "Sex-Related Crimes",
"cse": "Child Sexual Exploitation",
"ssh": "Suicide & Self-Harm",
"iwp": "Indiscriminate Weapons (CBRNE)",
"ipv": "Intellectual Property Violations",
"dfm": "Defamation",
"ncr": "Non-Violent Crimes",
"hte": "Hate",
"prv": "Privacy Violations",
"spc_ele": "Specialized Advice; elections",
"spc_fin": "Specialized Advice; financial",
"spc_hlt": "Specialized Advice; health",
"sxc_prn": "Sexual Content; pornographic",
"spc_lgl": "Specialized Advice; legal",
"vcr": HarmCategory.VIOLENT_CONTENT,
"src": HarmCategory.SEXUAL_CONTENT,
"cse": HarmCategory.SEXUAL_CONTENT,
"ssh": HarmCategory.SELF_HARM,
"iwp": HarmCategory.CBRN,
"ipv": HarmCategory.PROPRIETARY_INFO,
"dfm": HarmCategory.REPUTATIONAL_DAMAGE,
"ncr": HarmCategory.ILLEGAL,
"hte": HarmCategory.HATESPEECH,
"prv": HarmCategory.PPI,
"spc_ele": HarmCategory.CAMPAIGNING,
"spc_fin": HarmCategory.FINANCIAL_ADVICE,
"spc_hlt": HarmCategory.PUBLIC_HEALTH,
"sxc_prn": HarmCategory.SEXUAL_CONTENT,
"spc_lgl": HarmCategory.LEGAL_ADVICE,
}


@@ -58,7 +59,7 @@ def fetch_mlcommons_ailuminate_demo_dataset(
name="",
dataset_name="AILuminate v1.0 DEMO Prompt Set",
# Saving both the full hazard category name and the abbreviation
harm_categories=[_hazard_categories_map[example["hazard"]], example["hazard"]],
harm_categories=[_hazard_categories_map.get(example.get("hazard", "").lower(), HarmCategory.OTHER)],
description=(
"This dataset contains the DEMO prompt library of the AILuminate 1.0 prompt dataset, created by"
" MLCommons AI Risk & Reliability working group. It contains 1,200 human-generated prompts that"
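A small sketch of the lookup behavior introduced above (the row contents are hypothetical): hazard abbreviations are lower-cased before the map lookup, and an unmapped or missing abbreviation falls back to `HarmCategory.OTHER` rather than raising a `KeyError` as the previous direct indexing would.

```python
from pyrit.models import HarmCategory

# Tiny stand-in for the module's _hazard_categories_map (subset of the mapping above).
hazard_map = {
    "vcr": HarmCategory.VIOLENT_CONTENT,
    "hte": HarmCategory.HATESPEECH,
}

row = {"hazard": "VCR"}  # hypothetical example row
category = hazard_map.get(row.get("hazard", "").lower(), HarmCategory.OTHER)
# "VCR".lower() -> "vcr" -> HarmCategory.VIOLENT_CONTENT; anything unmapped or missing
# resolves to HarmCategory.OTHER.
```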
5 changes: 2 additions & 3 deletions pyrit/datasets/multilingual_vulnerability_dataset.py
@@ -3,8 +3,7 @@

import pandas as pd

from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_multilingual_vulnerability_dataset() -> SeedPromptDataset:
@@ -24,7 +23,7 @@ def fetch_multilingual_vulnerability_dataset() -> SeedPromptDataset:
data_type="text",
name=str(row["id"]),
dataset_name="Multilingual-Vulnerability",
harm_categories=[row["type"]],
harm_categories=[HarmCategory.parse(row["type"])],
description="Dataset from 'A Framework to Assess Multilingual Vulnerabilities of LLMs'. "
"Multilingual prompts demonstrating LLM vulnerabilities, labeled by type. "
"Paper: https://arxiv.org/pdf/2503.13081",