Fix OSS 20B vLLM example: add offline-serve workflow (no flash-infer sm7+) - Update run-vllm.md #2041


Open

hrithiksagar-tih wants to merge 1 commit into openai:main from hrithiksagar-tih:patch-1

articles/gpt-oss/run-vllm.md (32 changes: 21 additions & 11 deletions)
@@ -170,6 +170,11 @@ uv pip install openai-harmony
Afterwards, you can use Harmony to encode the prompt and parse the tokens generated by vLLM's `generate` function.

```py
# First activate your virtual environment, e.g.: source .oss/bin/activate

import os
os.environ["VLLM_USE_FLASHINFER_SAMPLER"] = "0"

import json
from openai_harmony import (
    HarmonyEncodingName,
@@ -180,12 +185,13 @@ from openai_harmony import (
    SystemContent,
    DeveloperContent,
)

from vllm import LLM, SamplingParams

# --- 1) Render the prefill with Harmony ---
encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)

convo = Conversation.from_messages(
    [
        Message.from_role_and_content(Role.SYSTEM, SystemContent.new()),
@@ -196,37 +202,41 @@ convo = Conversation.from_messages(
        Message.from_role_and_content(Role.USER, "What is the weather like in SF?"),
    ]
)

prefill_ids = encoding.render_conversation_for_completion(convo, Role.ASSISTANT)

# Harmony stop tokens (pass to sampler so they won't be included in output)
stop_token_ids = encoding.stop_tokens_for_assistant_actions()

# --- 2) Run vLLM with prefill ---
llm = LLM(
    model="openai/gpt-oss-20b",
    trust_remote_code=True,
    gpu_memory_utilization=0.95,
    # max_num_batched_tokens=4096,  # Optional
    # max_model_len=5000,  # Optional
    # tensor_parallel_size=1  # Optional
)

sampling = SamplingParams(
    max_tokens=128,
    temperature=1,
    stop_token_ids=stop_token_ids,
)

outputs = llm.generate(
    prompt_token_ids=[prefill_ids],  # batch of size 1
    sampling_params=sampling,
)

# vLLM gives you both text and token IDs
gen = outputs[0].outputs[0]
text = gen.text
output_tokens = gen.token_ids  # <-- these are the completion token IDs (no prefill)

# --- 3) Parse the completion token IDs back into structured Harmony messages ---
entries = encoding.parse_messages_from_completion_tokens(output_tokens, Role.ASSISTANT)

# 'entries' is a sequence of structured conversation entries (assistant messages, tool calls, etc.).
for message in entries:
    print(f"{json.dumps(message.to_dict())}")
```
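
The diff disables the FlashInfer sampler unconditionally. As a rough sketch (not part of the PR), the flag could instead be set only on older GPUs; the sm80 cutoff below is an assumption inferred from the "no flash-infer sm7+" note in the title:

```py
# Sketch only: conditionally disable the FlashInfer sampler on pre-sm80 GPUs.
# torch is already a vLLM dependency; set the flag before importing vllm,
# as the PR's example does.
import os

import torch

if torch.cuda.is_available():
    major, _minor = torch.cuda.get_device_capability()
    if major < 8:  # assumption: FlashInfer sampling needs compute capability 8.0+
        os.environ["VLLM_USE_FLASHINFER_SAMPLER"] = "0"
```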

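If you want the user-facing answer rather than the raw dicts, a hypothetical helper (not from the PR) could filter the parsed entries by channel; the "channel", "content", and "text" field names are assumptions about how Harmony messages serialize:

```py
def final_channel_text(entries) -> str:
    """Join the text parts of messages on the 'final' channel (field names assumed)."""
    chunks = []
    for message in entries:
        data = message.to_dict()
        if data.get("channel") != "final":
            continue  # skip analysis/commentary channels
        for part in data.get("content", []):
            if isinstance(part, dict) and part.get("text"):
                chunks.append(part["text"])
    return "\n".join(chunks)


# Continuing from the example above:
# print(final_channel_text(entries))
```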
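For contrast with this offline workflow, the same model can also be queried through vLLM's OpenAI-compatible server. A minimal sketch, assuming a server started separately with `vllm serve openai/gpt-oss-20b`; the endpoint URL and placeholder API key are assumptions:

```py
from openai import OpenAI

# Assumed local endpoint; vLLM's server ignores the API key unless one is configured.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="openai/gpt-oss-20b",
    messages=[{"role": "user", "content": "What is the weather like in SF?"}],
    max_tokens=128,
)
print(response.choices[0].message.content)
```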