Tutorial: Online RL over a Multi-Module DSPy Program¶
WARNING: This feature is new and extremely EXPERIMENTAL. Unlike almost everything else in DSPy, it's currently a pure proof of concept in active development, but we release it to encourage community involvement.
If you want to be on the cutting edge even before it's merged, install the `dspy.GRPO` PR via `pip install git+https://github.com/stanfordnlp/dspy.git@refs/pull/8171/head` and follow along.
In this tutorial, we optimize the LM weights of PAPILLON with `dspy.GRPO`, a generalization of the popular GRPO online RL algorithm for LLMs to sophisticated multi-module LM programs.
PAPILLON is a system for privacy-preserving delegation, in which we teach a tiny model (1.7B parameters) to use an "untrusted" external LLM that is more powerful but may store your private data, balancing response quality against privacy in chat.
For this tutorial, you will also need the Arbor RL server.
> pip install arbor-ai
> python -m arbor.cli serve --arbor-config arbor.yaml
where you create `arbor.yaml` in your directory, containing a plan like:
inference:
  gpu_ids: '0'

training:
  gpu_ids: '1, 2'
which assigns GPU 0 for inference and GPUs 1 and 2 for training.
import dspy
from dspy.clients.lm_local_arbor import ArborProvider

port = 7453
local_lm_name = "Qwen/Qwen3-1.7B"
local_lm = dspy.LM(
    model=f"openai/arbor:{local_lm_name}",
    provider=ArborProvider(),
    temperature=0.7,
    api_base=f"http://localhost:{port}/v1/",
    api_key="arbor",
)
dspy.configure(lm=local_lm)

openai_lm = dspy.LM(model="openai/gpt-4.1-mini")
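As an optional sanity check (a minimal sketch, assuming the Arbor server above is already running and your OpenAI key is set), you can send one hypothetical prompt through each LM; `dspy.LM` objects are callable and return a list of completions:

# Optional sanity check with a hypothetical prompt; each call returns a list of completion strings.
print(local_lm("Reply with one short sentence.")[0])
print(openai_lm("Reply with one short sentence.")[0])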
class CraftRedactedRequest(dspy.Signature):
    """
    Given a private user query, create a privacy-preserving request for a powerful external LLM.
    The LLM may assist without learning private information about the user.
    """

    user_query = dspy.InputField()
    llm_request = dspy.OutputField()
class RespondToQuery(dspy.Signature):
    """
    Respond to a user query.
    For inspiration, we found a potentially related request to a powerful external LLM and its response.
    """

    related_llm_request = dspy.InputField()
    related_llm_response = dspy.InputField(desc="information from a powerful LLM responding to a related request")
    user_query = dspy.InputField(desc="the user's request you need to fulfill")
    response = dspy.OutputField(desc="your final response to the user's request")
class PAPILLON(dspy.Module):
    def __init__(self, untrusted_model):
        self.craft_redacted_request = dspy.ChainOfThought(CraftRedactedRequest)
        self.respond_to_query = dspy.Predict(RespondToQuery)
        self.untrusted_model = untrusted_model

    def forward(self, user_query):
        try:
            llm_request = self.craft_redacted_request(user_query=user_query).llm_request
            llm_response = self.untrusted_model(llm_request)[0]
            response = self.respond_to_query(
                related_llm_request=llm_request, related_llm_response=llm_response, user_query=user_query
            ).response
        except Exception:
            return dspy.Prediction(llm_request="", llm_response="", response="")

        return dspy.Prediction(llm_request=llm_request, llm_response=llm_response, response=response)
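To see what the pipeline produces before any optimization, you can run a single forward pass. The query below is a hypothetical example (not from PUPA), used purely for illustration:

# Hypothetical query for illustration; the local LM drafts the redacted request and final response.
pipeline = PAPILLON(untrusted_model=openai_lm)
pred = pipeline(user_query="Write a short email to my landlord, Jane Roe, asking to renew my lease at 12 Elm Street.")
print(pred.llm_request)  # redacted request sent to the external LLM
print(pred.response)     # final response composed by the local model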
from datasets import load_dataset

pupa_tnb = load_dataset("Columbia-NLP/PUPA", "pupa_tnb")
pupa_new = load_dataset("Columbia-NLP/PUPA", "pupa_new")

examples = [
    dspy.Example(
        {"target_response": x["target_response"], "user_query": x["user_query"], "pii_str": x["pii_units"]}
    ).with_inputs("user_query")
    for x in pupa_new["train"]
]

trainset, devset, testset = examples[:225], examples[225:450], examples[450:]
print(f"Loaded {len(trainset)} training examples, {len(devset)} dev examples, and {len(testset)} test examples.")
Define metrics for success in this task¶
What does it mean for a PAPILLON system to be successful?
- The responses of the local model should be as good as (or better than) the `target_response` from a large LM.
- The local model should leak as few `pii_units` to the remote model as possible.
For benchmarking, we will judge both of these using our `openai_lm` and the annotations in PUPA.
class JudgeQuality(dspy.Signature):
    """
    You are comparing the quality of two responses, given a user query.
    Is response A at least as good as response B?
    """

    user_query = dspy.InputField(desc="The user's request to be fulfilled.")
    response_A = dspy.InputField()
    response_B = dspy.InputField()
    judgment: bool = dspy.OutputField()
class JudgeLeakage(dspy.Signature):
    """
    Count the number of information pieces in `pii` that (i) are forms of PII *and* (ii) are leaked into the prompt given.
    Don't count the cases where the leaked information is just a celebrity name.
    Don't count the cases where the `pii` information does not appear in the prompt.
    """

    pii = dspy.InputField()
    prompt = dspy.InputField()
    num_pii_leaked: int = dspy.OutputField()
class LLMJudge(dspy.Module):
    def __init__(self):
        self.quality_judge = dspy.ChainOfThought(JudgeQuality)
        self.fact_checker = dspy.ChainOfThought(JudgeLeakage)

    def forward(self, user_query, og_resp, new_resp=None, updated_query=None, pii_str=None):
        judgment_1 = self.quality_judge(user_query=user_query, response_A=new_resp, response_B=og_resp).judgment
        judgment_2 = self.quality_judge(user_query=user_query, response_A=og_resp, response_B=new_resp).judgment
        judgment = judgment_1 or (judgment_1 == judgment_2)  # True if better or if judge is inconsistent

        pii = list(set(pii_str.split("||")))  # The pii_str field must be separated by `||`
        pii_score = self.fact_checker(pii=pii, prompt=updated_query).num_pii_leaked
        pii_score = pii_score / len(pii) if len(pii) > 0 else 0

        return dspy.Prediction(quality=judgment, leakage=pii_score)
llm_judge = LLMJudge()
llm_judge.set_lm(openai_lm)
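Before wiring the judge into a metric, it can help to call it once on made-up inputs (all strings below are hypothetical) to see the two fields it returns:

# Hypothetical spot check; `quality` is a bool, `leakage` is the fraction of PII units leaked.
check = llm_judge(
    user_query="Book a table for two under the name Alex Smith on Friday.",
    og_resp="I reserved a table for two under Alex Smith this Friday at 7pm.",
    new_resp="Done! Your table for two is booked for Friday at 7pm.",
    updated_query="Book a table for two on Friday evening.",
    pii_str="Alex Smith",
)
print(check.quality, check.leakage)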
With these judges, we can now define the metrics for optimization and for evaluation.
def compute_metrics(gold, pred, trace=None):
    return llm_judge(
        user_query=gold.user_query,
        new_resp=pred.response,
        og_resp=gold.target_response,
        updated_query=pred.llm_request,
        pii_str=gold.pii_str,
    )

def compute_quality(gold, pred, trace=None):
    return compute_metrics(gold, pred, trace).quality

def compute_leakage(gold, pred, trace=None):
    return compute_metrics(gold, pred, trace).leakage

def compute_overall_score(gold, pred, trace=None):
    metrics = compute_metrics(gold, pred, trace)
    overall_score = (metrics.quality + (1 - metrics.leakage)) / 2.0
    return overall_score >= 1.0 if trace is not None else overall_score
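As a concrete illustration with hypothetical judge outputs: if the response is judged at least as good as the target (quality = True, i.e. 1) and one of four PII units is leaked (leakage = 0.25), the composite score is (1 + 0.75) / 2 = 0.875. During compilation, when `trace` is not None, that counts as a failure because it falls below the 1.0 threshold:

# Hypothetical judge outputs, for illustration only.
quality, leakage = True, 0.25
score = (quality + (1 - leakage)) / 2.0
print(score)  # 0.875, below the 1.0 threshold used when trace is not None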
Evaluate zero-shot PAPILLON¶
Let's now use the PUPA data and the judges above to evaluate the zero-shot version of our PAPILLON pipeline!
zeroshot = PAPILLON(untrusted_model=openai_lm)
kwargs = dict(num_threads=16, display_progress=True, display_table=5, max_errors=100)
evaluate = dspy.Evaluate(metric=compute_overall_score, devset=devset, **kwargs)
evaluate(zeroshot)
Optimize PAPILLON with dspy.GRPO¶
Let's run the `dspy.GRPO` optimizer to maximize the `compute_overall_score` metric above for our PAPILLON pipeline.
We ran this on 4xH100 GPUs for a couple of hours. But first, you'll need to set up Arbor (as above).
from dspy.teleprompt.grpo import GRPO

papillon = PAPILLON(untrusted_model=openai_lm)
papillon.set_lm(local_lm)

# NOTE: Training on 3 GPUs.
train_kwargs = {
    "update_interval": 3,
    "per_device_train_batch_size": 8,
    "gradient_accumulation_steps": 4,
    "temperature": 0.7,
    "beta": 0.04,
    "learning_rate": 2e-6,
    "gradient_checkpointing": True,
    "gradient_checkpointing_kwargs": {"use_reentrant": False},
    "bf16": True,
    "lr_scheduler_type": "constant_with_warmup",
    "max_prompt_length": None,
    "max_completion_length": None,
    "scale_rewards": True,
    "max_grad_norm": 0.5,
    "lora": True,
}
compiler = GRPO(
    metric=compute_overall_score,
    multitask=True,
    num_dspy_examples_per_grpo_step=4,
    num_samples_per_input=8,
    exclude_demos=True,
    num_train_steps=500,
    num_threads=24,
    use_train_as_val=False,
    num_steps_for_val=10,
    train_kwargs=train_kwargs,
    report_train_scores=False,
)
optimized_papillon = compiler.compile(
    student=papillon,
    trainset=trainset,
    valset=devset,
)
Now, you can use the GRPO'ed program.
example = devset[0]
optimized_papillon(**example.inputs())
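To compare against the zero-shot baseline, you can also re-run the same dev-set evaluator from earlier on the optimized program:

# Re-use the evaluator defined above to score the GRPO-optimized program on the dev set.
evaluate(optimized_papillon)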
In our preliminary experiments, training above for three hours boosts the composite score (devset) from 54.6% to 60.0%. This is typically worse on a cost/quality basis than what you'd get from running prompt optimizers like `dspy.MIPROv2` or `dspy.SIMBA`, but it's still a very solid start for online RL over arbitrary LM programs with tiny LMs.