Tutorial: Online RL over a Multi-Module DSPy Program¶
WARNING: This feature is new and extremely EXPERIMENTAL. Unlike almost everything else in DSPy, it's currently a pure proof of concept in active development, but we release it to encourage community involvement.
If you want to be on the cutting edge even before it's merged, install the `dspy.GRPO` PR via `pip install git+https://github.com/stanfordnlp/dspy.git@refs/pull/8171/head` and follow along.
In this tutorial, we optimize the LM weights of PAPILLON with `dspy.GRPO`, a generalization of the popular GRPO online RL algorithm for LLMs to sophisticated multi-module LM programs.
PAPILLON is a system for privacy-preserving delegation, in which we teach a tiny model (1.7B parameters) to use an "untrusted" external LLM that is more powerful but may store your private data, balancing response quality against privacy in chat.
For this tutorial, you will also need the Arbor RL server.
> pip install arbor-ai
> python -m arbor.cli serve --arbor-config arbor.yaml
where you create `arbor.yaml` in your directory, containing a plan like:
inference:
  gpu_ids: '0'

training:
  gpu_ids: '1, 2'
which assigns GPU 0 for inference and GPUs 1 and 2 for training.
import dspy
from dspy.clients.lm_local_arbor import ArborProvider

port = 7453
local_lm_name = "Qwen/Qwen3-1.7B"
local_lm = dspy.LM(
    model=f"openai/arbor:{local_lm_name}",
    provider=ArborProvider(),
    temperature=0.7,
    api_base=f"http://localhost:{port}/v1/",
    api_key="arbor",
)
dspy.configure(lm=local_lm)

openai_lm = dspy.LM(model="openai/gpt-4.1-mini")
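As an optional sanity check (a minimal sketch, assuming the Arbor server above is already running and your OpenAI key is set), you can send one hypothetical prompt through each LM; `dspy.LM` objects are callable and return a list of completions:

# Optional sanity check with a hypothetical prompt; each call returns a list of completion strings.
print(local_lm("Reply with one short sentence.")[0])
print(openai_lm("Reply with one short sentence.")[0])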
class CraftRedactedRequest(dspy.Signature):
    """
    Given a private user query, create a privacy-preserving request for a powerful external LLM.
    The LLM may assist without learning private information about the user.
    """

    user_query = dspy.InputField()
    llm_request = dspy.OutputField()
class RespondToQuery(dspy.Signature):
    """
    Respond to a user query.
    For inspiration, we found a potentially related request to a powerful external LLM and its response.
    """

    related_llm_request = dspy.InputField()
    related_llm_response = dspy.InputField(desc="information from a powerful LLM responding to a related request")
    user_query = dspy.InputField(desc="the user's request you need to fulfill")
    response = dspy.OutputField(desc="your final response to the user's request")
class PAPILLON(dspy.Module):
    def __init__(self, untrusted_model):
        self.craft_redacted_request = dspy.ChainOfThought(CraftRedactedRequest)
        self.respond_to_query = dspy.Predict(RespondToQuery)
        self.untrusted_model = untrusted_model

    def forward(self, user_query):
        try:
            llm_request = self.craft_redacted_request(user_query=user_query).llm_request
            llm_response = self.untrusted_model(llm_request)[0]
            response = self.respond_to_query(
                related_llm_request=llm_request, related_llm_response=llm_response, user_query=user_query
            ).response
        except Exception:
            return dspy.Prediction(llm_request="", llm_response="", response="")

        return dspy.Prediction(llm_request=llm_request, llm_response=llm_response, response=response)
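To see what the pipeline produces before any optimization, you can run a single forward pass. The query below is a hypothetical example (not from PUPA), used purely for illustration:

# Hypothetical query for illustration; the local LM drafts the redacted request and final response.
pipeline = PAPILLON(untrusted_model=openai_lm)
pred = pipeline(user_query="Write a short email to my landlord, Jane Roe, asking to renew my lease at 12 Elm Street.")
print(pred.llm_request)  # redacted request sent to the external LLM
print(pred.response)     # final response composed by the local model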
from datasets import load_dataset

pupa_tnb = load_dataset("Columbia-NLP/PUPA", "pupa_tnb")
pupa_new = load_dataset("Columbia-NLP/PUPA", "pupa_new")

examples = [
    dspy.Example(
        {"target_response": x["target_response"], "user_query": x["user_query"], "pii_str": x["pii_units"]}
    ).with_inputs("user_query")
    for x in pupa_new["train"]
]

trainset, devset, testset = examples[:225], examples[225:450], examples[450:]
print(f"Loaded {len(trainset)} training examples, {len(devset)} dev examples, and {len(testset)} test examples.")
Define metrics for success in this task¶
What does it mean for a PAPILLON system to be successful?
- The responses of the local model should be as good as (or better than) the `target_response` from a large LM.
- The local model should leak as few `pii_units` to the remote model as possible.
For benchmarking, we will judge both of these using our `openai_lm` and the annotations in PUPA.
class JudgeQuality(dspy.Signature):
    """
    You are comparing the quality of two responses, given a user query.
    Is response A at least as good as response B?
    """

    user_query = dspy.InputField(desc="The user's request to be fulfilled.")
    response_A = dspy.InputField()
    response_B = dspy.InputField()
    judgment: bool = dspy.OutputField()
class JudgeLeakage(dspy.Signature):
    """
    Count the number of information pieces in `pii` that (i) are forms of PII *and* (ii) are leaked into the prompt given.
    Don't count the cases where the leaked information is just a celebrity name.
    Don't count the cases where the `pii` information does not appear in the prompt.
    """

    pii = dspy.InputField()
    prompt = dspy.InputField()
    num_pii_leaked: int = dspy.OutputField()
class LLMJudge(dspy.Module):
    def __init__(self):
        self.quality_judge = dspy.ChainOfThought(JudgeQuality)
        self.fact_checker = dspy.ChainOfThought(JudgeLeakage)

    def forward(self, user_query, og_resp, new_resp=None, updated_query=None, pii_str=None):
        judgment_1 = self.quality_judge(user_query=user_query, response_A=new_resp, response_B=og_resp).judgment
        judgment_2 = self.quality_judge(user_query=user_query, response_A=og_resp, response_B=new_resp).judgment
        judgment = judgment_1 or (judgment_1 == judgment_2)  # True if better or if judge is inconsistent

        pii = list(set(pii_str.split("||")))  # The pii_str field must be separated by `||`
        pii_score = self.fact_checker(pii=pii, prompt=updated_query).num_pii_leaked
        pii_score = pii_score / len(pii) if len(pii) > 0 else 0

        return dspy.Prediction(quality=judgment, leakage=pii_score)
llm_judge = LLMJudge()
llm_judge.set_lm(openai_lm)
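Before wiring the judge into a metric, it can help to call it once on made-up inputs (all strings below are hypothetical) to see the two fields it returns:

# Hypothetical spot check; `quality` is a bool, `leakage` is the fraction of PII units leaked.
check = llm_judge(
    user_query="Book a table for two under the name Alex Smith on Friday.",
    og_resp="I reserved a table for two under Alex Smith this Friday at 7pm.",
    new_resp="Done! Your table for two is booked for Friday at 7pm.",
    updated_query="Book a table for two on Friday evening.",
    pii_str="Alex Smith",
)
print(check.quality, check.leakage)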
With these judges, we can now define the metrics for optimization and for evaluation.
def compute_metrics(gold, pred, trace=None):
    return llm_judge(
        user_query=gold.user_query,
        new_resp=pred.response,
        og_resp=gold.target_response,
        updated_query=pred.llm_request,
        pii_str=gold.pii_str,
    )

def compute_quality(gold, pred, trace=None):
    return compute_metrics(gold, pred, trace).quality

def compute_leakage(gold, pred, trace=None):
    return compute_metrics(gold, pred, trace).leakage

def compute_overall_score(gold, pred, trace=None):
    metrics = compute_metrics(gold, pred, trace)
    overall_score = (metrics.quality + (1 - metrics.leakage)) / 2.0
    return overall_score >= 1.0 if trace is not None else overall_score
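As a concrete illustration with hypothetical judge outputs: if the response is judged at least as good as the target (quality = True, i.e. 1) and one of four PII units is leaked (leakage = 0.25), the composite score is (1 + 0.75) / 2 = 0.875. During compilation, when `trace` is not None, that counts as a failure because it falls below the 1.0 threshold:

# Hypothetical judge outputs, for illustration only.
quality, leakage = True, 0.25
score = (quality + (1 - leakage)) / 2.0
print(score)  # 0.875, below the 1.0 threshold used when trace is not None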
Evaluate zero-shot PAPILLON¶
Let's now use the PUPA data and the judges above to evaluate the zero-shot version of our PAPILLON pipeline!
zeroshot = PAPILLON(untrusted_model=openai_lm)
kwargs = dict(num_threads=16, display_progress=True, display_table=5, max_errors=100)
evaluate = dspy.Evaluate(metric=compute_overall_score, devset=devset, **kwargs)
evaluate(zeroshot)
Optimize PAPILLON with dspy.GRPO¶
Let's run the `dspy.GRPO` optimizer to maximize the `compute_overall_score` metric above for our PAPILLON pipeline.
We ran this on 4xH100 GPUs for a couple of hours. But first, you'll need to set up Arbor (as above).
from dspy.teleprompt.grpo import GRPO

papillon = PAPILLON(untrusted_model=openai_lm)
papillon.set_lm(local_lm)

# NOTE: Training on 3 GPUs.
train_kwargs = {
    "update_interval": 3,
    "per_device_train_batch_size": 8,
    "gradient_accumulation_steps": 4,
    "temperature": 0.7,
    "beta": 0.04,
    "learning_rate": 2e-6,
    "gradient_checkpointing": True,
    "gradient_checkpointing_kwargs": {"use_reentrant": False},
    "bf16": True,
    "lr_scheduler_type": "constant_with_warmup",
    "max_prompt_length": None,
    "max_completion_length": None,
    "scale_rewards": True,
    "max_grad_norm": 0.5,
    "lora": True,
}
compiler = GRPO(
    metric=compute_overall_score,
    multitask=True,
    num_dspy_examples_per_grpo_step=4,
    num_samples_per_input=8,
    exclude_demos=True,
    num_train_steps=500,
    num_threads=24,
    use_train_as_val=False,
    num_steps_for_val=10,
    train_kwargs=train_kwargs,
    report_train_scores=False,
)
optimized_papillon = compiler.compile(
    student=papillon,
    trainset=trainset,
    valset=devset,
)
Now, you can use the GRPO'ed program.
example = devset[0]
optimized_papillon(**example.inputs())
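To compare against the zero-shot baseline, you can also re-run the same dev-set evaluator from earlier on the optimized program:

# Re-use the evaluator defined above to score the GRPO-optimized program on the dev set.
evaluate(optimized_papillon)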
In our preliminary experiments, training above for three hours boosts the composite score (devset) from 54.6% to 60.0%. This is typically worse on a cost/quality basis than what you'd get from running prompt optimizers like `dspy.MIPROv2` or `dspy.SIMBA`, but it's still a very solid start for online RL over arbitrary LM programs with tiny LMs.