Commit 43570b6

authored

Misc additions (#16)

1 parent 5ef38b9 commit 43570b6 (Copy full SHA for 43570b6)

File tree

9 files changed

+93

-28

lines changed

Dockerfile
Makefile
requirements.txt
src
- main.py
- pipelines
  - pipeline.py
- utils

9 files changed

+93

-28

lines changed

`‎Dockerfile‎`

Lines changed: 1 addition & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-FROM nvcr.io/nvidia/pytorch:22.11-py3`
	`1`	`+FROM nvcr.io/nvidia/pytorch:23.01-py3`
`2`	`2`
`3`	`3`	`ARG USER=1000`
`4`	`4`	`ARG USERNAME=user`

`‎Makefile‎`

Lines changed: 8 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -55,6 +55,14 @@ gpt-bigcode-mqa1:`
`55`	`55`	`gpt-bigcode-mqa2:`
`56`	`56`	`${RUN_HF} ${BIGCODE_ARGS} attention_type=3`
`57`	`57`
	`58`	`+.PHONY: santacoder-original`
	`59`	`+santacoder-original:`
	`60`	`+ ${RUN_HF} --pretrained_model=bigcode/santacoder --tokenizer=bigcode/santacoder --trust_remote_code ${EXP_ARGS}`
	`61`	`+`
`58`	`62`	`.PHONY: santacoder`
`59`	`63`	`santacoder:`
`60`	`64`	`${RUN_HF} --pretrained_model=bigcode/santacoder-fast-inference --tokenizer=bigcode/santacoder ${EXP_ARGS}`
	`65`	`+`
	`66`	`+.PHONY: optimized-santacoder`
	`67`	`+optimized-santacoder:`
	`68`	`+ ${RUN_HF} --pretrained_model=olivierdehaene/optimized-santacoder --tokenizer=bigcode/santacoder --trust_remote_code ${EXP_ARGS}`

`‎requirements.txt‎`

Lines changed: 1 addition & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,6 @@`
`1`	`1`	`accelerate==0.15.0`
`2`	`2`	`bitsandbytes`
	`3`	`+safetensors`
`3`	`4`	`deepspeed==0.7.7`
`4`	`5`	`-e ./transformers`
`5`	`6`

`‎src/main.py‎`

Lines changed: 2 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -14,11 +14,13 @@ def main(argv: Optional[List[str]] = None) -> None:`
`14`	`14`	`pipeline = pipeline_class(`
`15`	`15`	`model_type=args.model_type,`
`16`	`16`	`pretrained_model=args.pretrained_model,`
	`17`	`+ pretrained_config=args.pretrained_config,`
`17`	`18`	`config_args=args.config_args,`
`18`	`19`	`tokenizer=args.tokenizer,`
`19`	`20`	`device=args.device,`
`20`	`21`	`dtype=args.dtype,`
`21`	`22`	`fast_init=args.fast_init,`
	`23`	`+ trust_remote_code=args.trust_remote_code,`
`22`	`24`	`)`
`23`	`25`
`24`	`26`	`benchmark_end_to_end(`

`‎src/pipelines/pipeline.py‎`

Lines changed: 22 additions & 7 deletions

Original file line number	Diff line number	Diff line change
`@@ -9,6 +9,7 @@`
`9`	`9`
`10`	`10`	`from src.utils.fast_init import fast_init`
`11`	`11`	`from src.utils.logging import format_ms, log_rank_n`
	`12`	`+from src.utils.utils import parse_revision`
`12`	`13`	`from transformers import (`
`13`	`14`	`CONFIG_MAPPING,`
`14`	`15`	`AutoConfig,`
`@@ -41,12 +42,14 @@ def __init__(`
`41`	`42`	`self,`
`42`	`43`	`*,`
`43`	`44`	`model_type: Optional[str] = None,`
	`45`	`+ pretrained_config: Optional[str] = None,`
`44`	`46`	`pretrained_model: Optional[str] = None,`
`45`	`47`	`config_args: Dict[str, Any],`
`46`	`48`	`tokenizer: str,`
`47`	`49`	`device: torch.device,`
`48`	`50`	`dtype: torch.dtype,`
`49`	`51`	`fast_init: bool = True,`
	`52`	`+ trust_remote_code: bool = False,`
`50`	`53`	`):`
`51`	`54`	`self.initialization_metrics = {}`
`52`	`55`	`log_rank_n("*** Setting up tokenizer", logger.info)`
`@@ -60,10 +63,11 @@ def __init__(`
`60`	`63`	`self.dtype = dtype`
`61`	`64`	`self.is_int8 = self.dtype == torch.int8`
`62`	`65`	`self.fast_init = fast_init`
	`66`	`+ self.trust_remote_code = trust_remote_code`
`63`	`67`	`if self.is_int8 and self.device != torch.device("cuda"):`
`64`	`68`	`raise ValueError(f"Model quantization not supported on device {self.device}")`
`65`	`69`
`66`		`- self.config = self._get_config(model_type, pretrained_model, config_args)`
	`70`	`+ self.config = self._get_config(model_type, pretrained_config or pretrained_model, config_args)`
`67`	`71`	`t2 = time.perf_counter()`
`68`	`72`
`69`	`73`	`logger.info(f"Model configuration: {self.config}")`
`@@ -86,7 +90,9 @@ def _create_model(self) -> PreTrainedModel:`
`86`	`90`	`log_rank_n("*** Creating model", logger.info)`
`87`	`91`	`with fast_init(self.device) if self.fast_init else contextlib.nullcontext():`
`88`	`92`	`torch_dtype = torch.float16 if self.is_int8 else self.dtype`
`89`		`- model = AutoModelForCausalLM.from_config(config=self.config, torch_dtype=torch_dtype)`
	`93`	`+ model = AutoModelForCausalLM.from_config(`
	`94`	`+ config=self.config, torch_dtype=torch_dtype, trust_remote_code=self.trust_remote_code`
	`95`	`+ )`
`90`	`96`	`t1 = time.perf_counter()`
`91`	`97`	`log_rank_n("*** Moving to device", logger.info)`
`92`	`98`	`model.to(self.device)`
`@@ -98,6 +104,7 @@ def _create_model(self) -> PreTrainedModel:`
`98`	`104`	`self.initialization_metrics["model initialization"] = t1 - t0`
`99`	`105`	`self.initialization_metrics["move to device"] = t2 - t1`
`100`	`106`	`self.initialization_metrics["initialize weights"] = t3 - t2`
	`107`	`+`
`101`	`108`	`return model`
`102`	`109`
`103`	`110`	`def _reload_model(self):`
`@@ -118,9 +125,12 @@ def _load_pretrained(self, pretrained_model: str) -> PreTrainedModel:`
`118`	`125`	`log_rank_n(f"*** Loading model from {pretrained_model}", logger.info)`
`119`	`126`	`kwargs = {"load_in_8bit": True, "device_map": "auto"} if self.is_int8 else {"torch_dtype": self.dtype}`
`120`	`127`	`with fast_init(self.device) if self.fast_init else contextlib.nullcontext():`
	`128`	`+ pretrained_model, revision = parse_revision(pretrained_model)`
`121`	`129`	`model = AutoModelForCausalLM.from_pretrained(`
`122`	`130`	`pretrained_model,`
	`131`	`+ revision=revision,`
`123`	`132`	`config=self.config,`
	`133`	`+ trust_remote_code=self.trust_remote_code,`
`124`	`134`	`**kwargs,`
`125`	`135`	`)`
`126`	`136`	`t1 = time.perf_counter()`
`@@ -135,7 +145,7 @@ def _load_pretrained(self, pretrained_model: str) -> PreTrainedModel:`
`135`	`145`	`def _get_config(`
`136`	`146`	`self,`
`137`	`147`	`model_type: Optional[str],`
`138`		`- pretrained_model: Optional[str],`
	`148`	`+ pretrained_config: Optional[str],`
`139`	`149`	`config_args: Dict[str, Any],`
`140`	`150`	`) -> PretrainedConfig:`
`141`	`151`	`config_args = {`
`@@ -145,15 +155,16 @@ def _get_config(`
`145`	`155`	`}`
`146`	`156`
`147`	`157`	`if model_type is None:`
`148`		`- if pretrained_model is None:`
	`158`	`+ if pretrained_config is None:`
`149`	`159`	`raise ValueError("You need to provide either --model_type or --pretrained_model")`
`150`	`160`	`config_class = AutoConfig`
`151`	`161`	`elif model_type not in CONFIG_MAPPING:`
`152`	`162`	`raise ValueError(f"Unknown model type: {model_type}")`
`153`	`163`	`else:`
`154`	`164`	`config_class = CONFIG_MAPPING[model_type]`
	`165`	`+ config_args["model_type"] = model_type`
`155`	`166`
`156`		`- if pretrained_model is None:`
	`167`	`+ if pretrained_config is None:`
`157`	`168`	`config_args.update(`
`158`	`169`	`{`
`159`	`170`	`"bos_token_id": self.tokenizer.bos_token_id,`
`@@ -163,7 +174,10 @@ def _get_config(`
`163`	`174`	`)`
`164`	`175`	`config, unused = config_class.from_dict({}, **config_args)`
`165`	`176`	`else:`
`166`		`- config, unused = config_class.from_pretrained(pretrained_model, **config_args)`
	`177`	`+ pretrained_config, revision = parse_revision(pretrained_config)`
	`178`	`+ config, unused = config_class.from_pretrained(`
	`179`	`+ pretrained_config, revision=revision, trust_remote_code=self.trust_remote_code, **config_args`
	`180`	`+ )`
`167`	`181`
`168`	`182`	`if unused:`
`169`	`183`	`raise ValueError(f"There were unused configuration parameters: {tuple(unused)}")`
`@@ -216,7 +230,8 @@ def aggregate_and_format_metrics(self, metrics: List[Dict[str, Any]]):`
`216`	`230`	`"Latency (decode)": format_ms(mean_metrics[DECODE_TIME]),`
`217`	`231`	`"Latency (max)": format_ms(max(all_metrics[END_TO_END_TIME])),`
`218`	`232`	`"Latency (min)": format_ms(min(all_metrics[END_TO_END_TIME])),`
`219`		`- "Tokens generated": f"{mean_metrics[NUM_GENERATED_TOKENS]:.0f}",`
	`233`	`+ "Tokens generated (average)": f"{mean_metrics[NUM_GENERATED_TOKENS]:.0f}",`
	`234`	`+ "Tokens generated (total)": f"{np.sum(all_metrics[NUM_GENERATED_TOKENS]).item():.0f}",`
`220`	`235`	`"Throughput (model)": f"{model_throughput:.2f} tokens/s",`
`221`	`236`	`"Throughput (end to end)": f"{throughput:.2f} tokens/s",`
`222`	`237`	`"Token time (end to end)": f"{format_ms(throughput ** -1)}/token",`

`‎src/utils/arguments.py‎`

Lines changed: 11 additions & 5 deletions

Original file line number	Diff line number	Diff line change
`@@ -11,8 +11,10 @@ def get_arg_parser() -> ArgumentParser:`
`11`	`11`
`12`	`12`	`# Model`
`13`	`13`	`parser.add_argument("--model_type")`
	`14`	`+ parser.add_argument("--pretrained_config")`
`14`	`15`	`parser.add_argument("--pretrained_model")`
`15`	`16`	`parser.add_argument("--tokenizer", default="gpt2")`
	`17`	`+ parser.add_argument("--trust_remote_code", action="store_true")`
`16`	`18`	`parser.add_argument("config_args", nargs="*")`
`17`	`19`
`18`	`20`	`# Runtime`
`@@ -47,10 +49,14 @@ def get_arg_parser() -> ArgumentParser:`
`47`	`49`	`def parse_config_args(config_args: List[str]) -> typing.Dict[str, Any]:`
`48`	`50`	`parsed_config_args = {}`
`49`	`51`	`for config_arg in config_args:`
`50`		`- try:`
`51`		`- key, value = [x.strip() for x in config_arg.split("=")]`
`52`		`- except ValueError:`
`53`		`- raise ValueError(f"Cannot parse argument: {config_arg}")`
	`52`	`+ split_arg = [x.strip() for x in config_arg.split("=", 1)]`
	`53`	`+ if len(split_arg) != 2:`
	`54`	`+ raise ValueError(f"Cannot parse argument (not in 'key=value' format): {config_arg}")`
	`55`	`+ key, value = split_arg`
	`56`	`+ if not key.isidentifier():`
	`57`	`+ raise ValueError(f"Invalid argument (not a python identifier): {key}")`
	`58`	`+ if key in parsed_config_args:`
	`59`	`+ raise ValueError(f"Duplicate argument: {key}")`
`54`	`60`	`if value.lower() == "true":`
`55`	`61`	`value = True`
`56`	`62`	`elif value.lower() == "false":`
`@@ -65,7 +71,7 @@ def parse_config_args(config_args: List[str]) -> typing.Dict[str, Any]:`
`65`	`71`	`value = float(value)`
`66`	`72`	`except ValueError:`
`67`	`73`	`pass`
`68`		`- parsed_config_args[key.strip()] = value`
	`74`	`+ parsed_config_args[key] = value`
`69`	`75`	`return parsed_config_args`
`70`	`76`
`71`	`77`

`‎src/utils/benchmark.py‎`

Lines changed: 34 additions & 14 deletions

Original file line number	Diff line number	Diff line change
`@@ -1,12 +1,13 @@`
`1`	`1`	`import contextlib`
`2`	`2`	`import gc`
`3`	`3`	`import logging`
	`4`	`+import time`
`4`	`5`	`from typing import List, Union`
`5`	`6`
`6`	`7`	`import torch`
`7`	`8`
`8`	`9`	`from src.pipelines.pipeline import Pipeline`
`9`		`-from src.utils.logging import format_ms, log_dict, log_rank_n`
	`10`	`+from src.utils.logging import format_mib, format_ms, log_dict, log_rank_n`
`10`	`11`
`11`	`12`
`12`	`13`	`logger = logging.getLogger(__name__)`
`@@ -91,8 +92,27 @@ def benchmark_end_to_end(`
`91`	`92`	`else:`
`92`	`93`	`profiler = contextlib.nullcontext()`
`93`	`94`
	`95`	`+ benchmark_stats = {`
	`96`	`+ "Model parameters": pipeline.get_num_parameters(),`
	`97`	`+ "Batch size": len(inputs),`
	`98`	`+ **generate_kwargs,`
	`99`	`+ **pipeline.get_initialization_metrics(),`
	`100`	`+ "Warmup cycles": skip + warmup,`
	`101`	`+ "Benchmark cycles": cycles,`
	`102`	`+ "Total cycles": skip + warmup + cycles,`
	`103`	`+ }`
	`104`	`+`
	`105`	`+ if pipeline.device.type == "cuda":`
	`106`	`+ benchmark_stats["Initial memory used"] = format_mib(torch.cuda.memory_allocated())`
	`107`	`+ benchmark_stats["Initial memory reserved"] = format_mib(torch.cuda.memory_reserved())`
	`108`	`+ torch.cuda.reset_peak_memory_stats()`
	`109`	`+`
	`110`	`+ t0 = time.perf_counter()`
`94`	`111`	`with profiler as p:`
`95`	`112`	`for step in range(skip + warmup + cycles):`
	`113`	`+ if step == skip + warmup:`
	`114`	`+ t1 = time.perf_counter()`
	`115`	`+ benchmark_stats["Warmup time"] = format_ms(t1 - t0)`
`96`	`116`	`generated_text, metrics = pipeline(inputs, **generate_kwargs)`
`97`	`117`	`if profile:`
`98`	`118`	`p.step()`
`@@ -108,18 +128,18 @@ def benchmark_end_to_end(`
`108`	`128`	`torch.cuda.synchronize()`
`109`	`129`	`gc.collect()`
`110`	`130`	`torch.cuda.empty_cache()`
	`131`	`+ if pipeline.device.type == "cuda":`
	`132`	`+ benchmark_stats["Memory used"] = format_mib(torch.cuda.memory_allocated())`
	`133`	`+ benchmark_stats["Memory reserved"] = format_mib(torch.cuda.memory_reserved())`
	`134`	`+ benchmark_stats["Max memory used"] = format_mib(torch.cuda.max_memory_allocated())`
	`135`	`+ benchmark_stats["Max memory reserved"] = format_mib(torch.cuda.max_memory_reserved())`
	`136`	`+`
	`137`	`+ t2 = time.perf_counter()`
	`138`	`+ benchmark_stats["Benchmark time"] = format_ms(t2 - t1)`
	`139`	`+ benchmark_stats["Total time"] = format_ms(t2 - t0)`
`111`	`140`
`112`	`141`	`if len(all_metrics) > 0:`
`113`		`- log_rank_n("*** Performance metrics:", logger.info)`
`114`		`- log_dict(pipeline.aggregate_and_format_metrics(all_metrics), logger.info)`
`115`		`-`
`116`		`- log_rank_n("*** Benchmarking stats:", logger.info)`
`117`		`- log_dict(`
`118`		`- {`
`119`		`- "Model parameters": pipeline.get_num_parameters(),`
`120`		`- "Batch size": len(inputs),`
`121`		`- **generate_kwargs,`
`122`		`- **pipeline.get_initialization_metrics(),`
`123`		`- },`
`124`		`- logger.info,`
`125`		`- )`
	`142`	`+ benchmark_stats.update(pipeline.aggregate_and_format_metrics(all_metrics))`
	`143`	`+`
	`144`	`+ log_rank_n("*** Benchmark results:", logger.info)`
	`145`	`+ log_dict(benchmark_stats, logger.info)`

`‎src/utils/logging.py‎`

Lines changed: 4 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -43,3 +43,7 @@ def log_dict(data: dict, logger: Callable = logging.info, rank: int = 0):`
`43`	`43`
`44`	`44`	`def format_ms(t: float):`
`45`	`45`	`return f"{1000 * t:.2f} ms"`
	`46`	`+`
	`47`	`+`
	`48`	`+def format_mib(m: float):`
	`49`	`+ return f"{m/2**20:.0f} MiB"`

`‎src/utils/utils.py‎`

Lines changed: 10 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`import time`
`2`	`2`	`from functools import partial`
`3`		`-from typing import Any, List, Tuple, Union`
	`3`	`+from typing import Any, List, Optional, Tuple, Union`
`4`	`4`
`5`	`5`
`6`	`6`	`def run_and_log_time(execs: Union[List[partial], partial]) -> Tuple[Union[List[Any], Any], float]:`
`@@ -16,3 +16,12 @@ def run_and_log_time(execs: Union[List[partial], partial]) -> Tuple[Union[List[A`
`16`	`16`
`17`	`17`	`time_elapsed = time.perf_counter() - start_time`
`18`	`18`	`return results, time_elapsed`
	`19`	`+`
	`20`	`+`
	`21`	`+def parse_revision(pretrained_model: Optional[str]) -> Tuple[Optional[str], Optional[str]]:`
	`22`	`+ revision = None`
	`23`	`+ if pretrained_model is not None:`
	`24`	`+ pretrained_split = pretrained_model.split(":", 1)`
	`25`	`+ if len(pretrained_split) == 2:`
	`26`	`+ pretrained_model, revision = pretrained_split`
	`27`	`+ return pretrained_model, revision`

0 commit comments

Comments

(0)

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 43570b6

File tree

9 files changed

9 files changed

`‎Dockerfile‎`

`‎Makefile‎`

`‎requirements.txt‎`

`‎src/main.py‎`

`‎src/pipelines/pipeline.py‎`

`‎src/utils/arguments.py‎`

`‎src/utils/benchmark.py‎`

`‎src/utils/logging.py‎`

`‎src/utils/utils.py‎`

0 commit comments