@@ -26,26 +26,19 @@ def prepare_dataset(
     tokenizer_name: str = "gpt2",
     max_length: int = 1024,
     cache_dir: Optional[str] = None,
-    num_proc: int = 4
+    num_proc: int = 4,
+    text_column: Optional[str] = None
 ) -> Tuple[PYTORCH_Dataset, Optional[PYTORCH_Dataset], AutoTokenizer]:
3132 """ 
3233 Prepare a Hugging Face dataset for training. 
3334
3435 Args: 
-        dataset_name: Name of the dataset to load (e.g., "wikitext/wikitext-2-raw-v1").
-        tokenizer_name: Name of the tokenizer to use (e.g., "gpt2").
-        max_length: Maximum sequence length for tokenized inputs.
-        cache_dir: Directory to cache the dataset.
-        num_proc: Number of processes for tokenization.
-
-    Returns:
-        A tuple containing:
-        - Train dataset (PYTORCH_Dataset)
-        - Validation dataset (Optional[PYTORCH_Dataset])
-        - Tokenizer (AutoTokenizer)
-
-    Raises:
-        DatasetPreparationError: If there is an issue with loading or tokenizing the dataset.
+        dataset_name: Name of the dataset to load
+        tokenizer_name: Name of the tokenizer to use
+        max_length: Maximum sequence length
+        cache_dir: Directory to cache the dataset
+        num_proc: Number of processes for tokenization
+        text_column: Name of the text column (auto-detect if None)
     """
     try:
         # Load tokenizer
@@ -68,11 +61,37 @@ def prepare_dataset(
     except Exception as e:
         raise DatasetPreparationError(f"Failed to load dataset {dataset_name}: {str(e)}")
 
-    # Tokenize the dataset
+    # Auto-detect text column if not specified
+    if text_column is None:
+        # Common column names for text data
+        possible_columns = ['text', 'content', 'input_text', 'sentence', 'document']
+        available_columns = dataset['train'].column_names
+
+        # Find the first matching column
+        text_column = next(
+            (col for col in possible_columns if col in available_columns),
+            None
+        )
+
+        # If no standard column found, look for any string column
+        if text_column is None:
+            for col in available_columns:
+                if isinstance(dataset['train'][0][col], str):
+                    text_column = col
+                    break
+
+        if text_column is None:
+            raise DatasetPreparationError(
+                f"Could not detect text column. Available columns: {available_columns}"
+            )
+
+        logger.info(f"Auto-detected text column: {text_column}")
+
+    # Tokenize function with dynamic column handling
     def tokenize_function(examples):
         try:
             return tokenizer(
-                examples["text"],
+                examples[text_column],
                 padding="max_length",
                 truncation=True,
                 max_length=max_length,
@@ -82,11 +101,14 @@ def tokenize_function(examples):
             raise DatasetPreparationError(f"Tokenization failed: {str(e)}")
 
     logger.info("Tokenizing dataset")
+    # Remove only the text column used for tokenization
+    remove_columns = [text_column] if text_column in dataset['train'].column_names else None
+
     tokenized_dataset = dataset.map(
         tokenize_function,
         batched=True,
         num_proc=num_proc,
-        remove_columns=["text"]
+        remove_columns=remove_columns
     )
 
     # Convert to PyTorch datasets
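For reviewers, a minimal usage sketch of the changed signature. It assumes only what this diff shows (`prepare_dataset` returning a `(train, validation, tokenizer)` tuple, with the old docstring's `"wikitext/wikitext-2-raw-v1"` example); the second dataset name and its `document_body` column are hypothetical placeholders, not names from this repository.

```python
# Minimal sketch, not from this repository's test suite.

# 1) Rely on auto-detection: wikitext exposes a "text" column, which is the
#    first entry in possible_columns, so text_column can be left as None.
train_ds, val_ds, tokenizer = prepare_dataset(
    dataset_name="wikitext/wikitext-2-raw-v1",
    tokenizer_name="gpt2",
    max_length=1024,
)

# 2) Override auto-detection for a dataset with a non-standard column name.
#    "my_org/legal-corpus" and "document_body" are hypothetical stand-ins.
train_ds, val_ds, tokenizer = prepare_dataset(
    dataset_name="my_org/legal-corpus",
    tokenizer_name="gpt2",
    text_column="document_body",  # skips the auto-detect branch entirely
)
```

One consequence of the detection order worth noting: if a dataset exposes both `text` and `content` columns, `text` wins because it appears first in `possible_columns`; passing `text_column` explicitly is the way to pick any other column.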