2025
NeurIPS
ResponseRank: Data-Efficient Reward Modeling through Preference Strength Learning
@inproceedings{kaufmann2025responserank,author={Kaufmann, Timo and Metz, Yannick and Keim, Daniel and H{\"u}llermeier, Eyke},booktitle={Advances in Neural Information Processing Systems ({{NeurIPS}}), to appear},title={ResponseRank: Data-Efficient Reward Modeling through Preference Strength Learning},year={2025}}
TMLR
A Survey of Reinforcement Learning from Human Feedback
Reinforcement learning from human feedback (RLHF) is a variant of reinforcement learning (RL) that learns from human feedback instead of relying on an engineered reward function. Building on prior work on the related setting of preference-based reinforcement learning (PbRL), it stands at the intersection of artificial intelligence and human-computer interaction. This positioning provides a promising approach to enhance the performance and adaptability of intelligent systems while also improving the alignment of their objectives with human values. The success in training large language models (LLMs) has impressively demonstrated this potential in recent years, where RLHF has played a decisive role in directing the model’s capabilities towards human objectives. This article provides an overview of the fundamentals of RLHF, exploring how RL agents interact with human feedback. While recent focus has been on RLHF for LLMs, our survey covers the technique across multiple domains. We provide our most comprehensive coverage in control and robotics, where many fundamental techniques originate, alongside a dedicated LLM section. We examine the core principles that underpin RLHF, how algorithms and human feedback work together, and discuss the main research trends in the field. Our goal is to give researchers and practitioners a clear understanding of this rapidly growing field.
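To make the core mechanism concrete, here is a minimal sketch of the reward-learning step that RLHF builds on: fitting latent utilities to pairwise preferences under a Bradley-Terry model. The synthetic items, data, and plain gradient ascent are illustrative assumptions, not taken from the survey.

```python
# Minimal sketch: learn latent utilities from pairwise preferences (Bradley-Terry).
import numpy as np

rng = np.random.default_rng(0)
n_items, n_prefs = 5, 400
true_u = rng.normal(size=n_items)               # hidden ground-truth utilities

# Simulate pairwise preferences: the "winner" is drawn with Bradley-Terry probability.
pairs = rng.integers(0, n_items, size=(n_prefs, 2))
pairs = pairs[pairs[:, 0] != pairs[:, 1]]
p_first = 1.0 / (1.0 + np.exp(-(true_u[pairs[:, 0]] - true_u[pairs[:, 1]])))
first_wins = rng.random(len(pairs)) < p_first
winners = np.where(first_wins, pairs[:, 0], pairs[:, 1])
losers = np.where(first_wins, pairs[:, 1], pairs[:, 0])

# Maximum-likelihood fit of the utilities by gradient ascent on the log-likelihood.
u = np.zeros(n_items)
lr = 0.05
for _ in range(2000):
    p_win = 1.0 / (1.0 + np.exp(-(u[winners] - u[losers])))
    grad = np.zeros(n_items)
    np.add.at(grad, winners, 1.0 - p_win)
    np.add.at(grad, losers, -(1.0 - p_win))
    u += lr * grad / len(winners)
u -= u.mean()                                   # utilities are identifiable only up to a constant

print(np.corrcoef(true_u - true_u.mean(), u)[0, 1])   # correlation with ground truth, close to 1
```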
@article{kaufmann2025survey,title={A {{Survey}} of {{Reinforcement Learning}} from {{Human Feedback}}},author={Kaufmann, Timo and Weng, Paul and Bengs, Viktor and H{\"u}llermeier, Eyke},year={2025},journal={Transactions on Machine Learning Research},issn={2835-8856},}
arXiv
Feedback Forensics: A Toolkit to Measure AI Personality
Some traits making a "good" AI model are hard to describe upfront. For example, should responses be more polite or more casual? Such traits are sometimes summarized as model character or personality. Without a clear objective, conventional benchmarks based on automatic validation struggle to measure such traits. Evaluation methods using human feedback, such as Chatbot Arena, have emerged as a popular alternative. These methods infer "better" personality and other desirable traits implicitly by ranking multiple model responses relative to each other. Recent issues with model releases highlight the limitations of these opaque evaluation approaches: a major model was rolled back over sycophantic personality issues, and models have been observed overfitting to such feedback-based leaderboards. Despite these known issues, limited public tooling exists to explicitly evaluate model personality. We introduce Feedback Forensics: an open-source toolkit to track AI personality changes, both those encouraged by human (or AI) feedback and those exhibited across AI models trained and evaluated on such feedback. Leveraging AI annotators, our toolkit enables investigating personality via a Python API and browser app. We demonstrate the toolkit’s usefulness in two steps: (A) we first analyse the personality traits encouraged in popular human feedback datasets, including Chatbot Arena, MultiPref and PRISM; and (B) we then use our toolkit to analyse how much popular models exhibit such traits. We release (1) our Feedback Forensics toolkit alongside (2) a web app tracking AI personality in popular models and feedback datasets, as well as (3) the underlying annotation data at https://github.com/rdnfn/feedback-forensics.
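As a conceptual illustration only (this is not the Feedback Forensics API), the sketch below shows one way a trait's "encouragement" by pairwise feedback could be quantified: the share of trait-discriminating comparisons won by responses exhibiting the trait. The Comparison dataclass and its fields are hypothetical stand-ins for per-response AI-annotator judgments.

```python
# Illustrative sketch (not the Feedback Forensics API): estimate how strongly a
# personality trait is favoured by pairwise feedback data.
from dataclasses import dataclass

@dataclass
class Comparison:
    chosen_has_trait: bool    # e.g. an AI annotator judged the chosen response "casual"
    rejected_has_trait: bool  # the same annotation for the rejected response

def trait_encouragement(comparisons: list[Comparison]) -> float:
    """Share of trait-discriminating comparisons won by the trait, in [0, 1].
    A value of 0.5 means the feedback is indifferent to the trait."""
    wins = sum(c.chosen_has_trait and not c.rejected_has_trait for c in comparisons)
    losses = sum(c.rejected_has_trait and not c.chosen_has_trait for c in comparisons)
    return 0.5 if wins + losses == 0 else wins / (wins + losses)

data = [Comparison(True, False), Comparison(True, False), Comparison(False, True)]
print(trait_encouragement(data))  # ~0.67: this feedback mildly favours the trait
```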
@article{findeis2025feedback,title={Feedback {{Forensics}}: {{A Toolkit}} to {{Measure AI Personality}}},author={Findeis, Arduin and Kaufmann, Timo and H{\"u}llermeier, Eyke and Mullins, Robert},year={2025}}
ICLR
Inverse Constitutional AI: Compressing Preferences into Principles
Feedback data plays an important role in fine-tuning and evaluating state-of-the-art AI models. Often pairwise text preferences are used: given two texts, human (or AI) annotators select the "better" one. Such feedback data is widely used to align models to human preferences (e.g., reinforcement learning from human feedback), or to rank models according to human preferences (e.g., Chatbot Arena). Despite its widespread use, prior work has demonstrated that human-annotated pairwise text preference data often exhibits unintended biases. For example, human annotators have been shown to prefer assertive over truthful texts in certain contexts. Models trained or evaluated on this data may implicitly encode these biases in a manner hard to identify. In this paper, we formulate the interpretation of existing pairwise text preference data as a compression task: the Inverse Constitutional AI (ICAI) problem. In constitutional AI, a set of principles (or constitution) is used to provide feedback and fine-tune AI models. The ICAI problem inverts this process: given a dataset of feedback, we aim to extract a constitution that best enables a large language model (LLM) to reconstruct the original annotations. We propose a corresponding initial ICAI algorithm and validate its generated constitutions quantitatively based on reconstructed annotations. Generated constitutions have many potential use cases – they may help identify undesirable biases, scale feedback to unseen data or assist with adapting LLMs to individual user preferences. We demonstrate our approach on a variety of datasets: (a) synthetic feedback datasets with known underlying principles; (b) the AlpacaEval dataset of cross-annotated human feedback; and (c) the crowdsourced Chatbot Arena dataset. We release the code for our algorithm and experiments at https://github.com/rdnfn/icai.
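The ICAI objective can be summarized as a scoring problem, sketched below under assumptions: the annotate callable is a stand-in for an LLM prompted with the candidate principles, and best_constitution is a hypothetical helper for selecting among candidate constitutions, not the released algorithm.

```python
# Sketch of the ICAI objective: score a constitution by how well it lets an
# annotator reconstruct the original pairwise labels.
from typing import Callable, Sequence

Pair = tuple[str, str]                                   # (response_a, response_b)
Annotator = Callable[[Sequence[str], Pair], int]         # returns 0 if response_a preferred, else 1

def reconstruction_accuracy(constitution: Sequence[str],
                            pairs: Sequence[Pair],
                            labels: Sequence[int],
                            annotate: Annotator) -> float:
    """Fraction of the original annotations the constitution lets the annotator recover."""
    hits = sum(annotate(constitution, pair) == label for pair, label in zip(pairs, labels))
    return hits / len(labels)

def best_constitution(candidates: Sequence[Sequence[str]],
                      pairs: Sequence[Pair],
                      labels: Sequence[int],
                      annotate: Annotator) -> Sequence[str]:
    """Pick the candidate set of principles that best reconstructs the feedback."""
    return max(candidates, key=lambda c: reconstruction_accuracy(c, pairs, labels, annotate))
```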
@inproceedings{findeis2025inverse,title={Inverse {{Constitutional AI}}: {{Compressing Preferences}} into {{Principles}}},booktitle={Proceedings of the International Conference on Learning Representations ({{ICLR}})},shorttitle={Inverse {{Constitutional AI}}},author={Findeis, Arduin and Kaufmann, Timo and H{\"u}llermeier, Eyke and Albanie, Samuel and Mullins, Robert},year={2025}}
ICML
Comparing Comparisons: Informative and Easy Human Feedback with Distinguishability Queries
Xuening Feng, Zhaohui Jiang, Timo Kaufmann, and 3 more authors
In Proceedings of the International Conference on Machine Learning (ICML), 2025
Learning human objectives from preference feedback has significantly advanced reinforcement learning (RL) in domains where objectives are hard to formalize. However, traditional methods based on pairwise trajectory comparisons face notable challenges, including the difficulty of comparing trajectories with subtle differences and the fact that such comparisons convey only ordinal information, limiting direct inference of preference strength. In this paper, we introduce a novel distinguishability query, enabling humans to express preference strength by comparing two pairs of trajectories. Labelers first indicate which of two pairs is easier to distinguish, then provide preference feedback only on the easier pair. Our proposed query type directly captures preference strength and is expected to reduce the cognitive load on the labeler. We further connect this query to cardinal utility and difference relations and develop an efficient query selection scheme to achieve a better trade-off between query informativeness and easiness. Experimental results demonstrate the potential of our method for faster, data-efficient learning and improved user-friendliness in RLHF benchmarks, particularly in classical control settings where preference strength is critical for expected utility maximization.
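One way to formalize the idea is sketched below, assuming a Boltzmann-rational labeler model (the paper's exact model may differ): the answer to a distinguishability query depends on the relative size of utility gaps, so it carries cardinal information that an ordinary pairwise comparison cannot.

```python
# Sketch of a distinguishability query under an assumed Boltzmann-rational labeler.
import math

def p_easier(u_a: float, u_b: float, u_c: float, u_d: float, beta: float = 1.0) -> float:
    """Probability that pair (a, b) is reported as easier to distinguish than pair (c, d):
    larger absolute utility gaps are assumed to be easier to compare."""
    gap_ab, gap_cd = abs(u_a - u_b), abs(u_c - u_d)
    return 1.0 / (1.0 + math.exp(-beta * (gap_ab - gap_cd)))

def p_prefer(u_x: float, u_y: float, beta: float = 1.0) -> float:
    """Standard Boltzmann-rational preference probability on the easier pair."""
    return 1.0 / (1.0 + math.exp(-beta * (u_x - u_y)))

# The joint answer (which pair is easier, plus a preference on it) is informative about
# the size of utility gaps, not only their sign; this is the cardinal signal that
# ordinary pairwise comparisons cannot convey.
print(p_easier(2.0, 0.0, 1.1, 1.0), p_prefer(2.0, 0.0))
```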
@inproceedings{feng2025comparing,title={Comparing {{Comparisons}}: {{Informative}} and {{Easy Human Feedback}} with {{Distinguishability Queries}}},shorttitle={Comparing {{Comparisons}}},booktitle={Proceedings of the {{International Conference}} on {{Machine Learning}} ({{ICML}})},author={Feng, Xuening and Jiang, Zhaohui and Kaufmann, Timo and H{\"u}llermeier, Eyke and Weng, Paul and Zhu, Yifei},year={2025}}
AAAI
DUO: Diverse, Uncertain, On-Policy Query Generation and Selection for Reinforcement Learning from Human Feedback
Xuening Feng, Zhaohui Jiang, Timo Kaufmann, and 4 more authors
In Proceedings of the AAAI Conference on Artificial Intelligence, 2025
Defining a reward function is usually a challenging but critical task for the system designer in reinforcement learning, especially when specifying complex behaviors. Reinforcement learning from human feedback (RLHF) emerges as a promising approach to circumvent this. In RLHF, the agent typically learns a reward function by querying a human teacher using pairwise comparisons of trajectory segments. A key question in this domain is how to reduce the number of queries necessary to learn an informative reward function since asking a human teacher too many queries is impractical and costly. To tackle this question, we propose DUO, a novel method for diverse, uncertain, on-policy query generation and selection in RLHF. Our method produces queries that are (1) more relevant for policy training (via an on-policy criterion), (2) more informative (via a principled measure of epistemic uncertainty), and (3) diverse (via a clustering-based filter). Experimental results on a variety of locomotion and robotic manipulation tasks demonstrate that our method can outperform state-of-the-art RLHF methods given the same total budget of queries, while being robust to possibly irrational teachers.
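A rough sketch of the selection step, under illustrative assumptions: candidate query pairs are assumed to be generated from the current policy upstream (the on-policy criterion), uncertainty is proxied by reward-ensemble disagreement, and diversity is enforced with a k-means filter. Shapes, features, and hyperparameters are placeholders, not the paper's configuration.

```python
# Sketch of uncertain-then-diverse query selection over on-policy candidates.
import numpy as np
from sklearn.cluster import KMeans

def select_queries(features: np.ndarray,        # (n_candidates, d) embeddings of segment pairs
                   ensemble_preds: np.ndarray,  # (n_models, n_candidates) predicted P(left > right)
                   budget: int) -> np.ndarray:
    """Return indices of `budget` candidate queries: uncertain first, then diverse."""
    uncertainty = ensemble_preds.std(axis=0)              # disagreement across the reward ensemble
    shortlist = np.argsort(-uncertainty)[: 5 * budget]    # keep the most uncertain candidates
    km = KMeans(n_clusters=budget, n_init=10, random_state=0).fit(features[shortlist])
    chosen = []
    for c in range(budget):                               # most uncertain member of each cluster
        members = shortlist[km.labels_ == c]
        if members.size:
            chosen.append(members[np.argmax(uncertainty[members])])
    return np.array(chosen)

rng = np.random.default_rng(0)
print(select_queries(rng.normal(size=(200, 8)), rng.random((5, 200)), budget=10))
```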
@inproceedings{feng2025duo,author={Feng, Xuening and Jiang, Zhaohui and Kaufmann, Timo and Xu, Puchen and Hüllermeier, Eyke and Weng, Paul and Zhu, Yifei},booktitle={Proceedings of the AAAI Conference on Artificial Intelligence},title={DUO: Diverse, Uncertain, On-Policy Query Generation and Selection for Reinforcement Learning from Human Feedback},year={2025},doi={10.1609/aaai.v39i16.33824}}
CL
Problem Solving Through Human-AI Preference-Based Cooperation
Subhabrata Dutta, Timo Kaufmann, Goran Glavaš, and 7 more authors
While there is a widespread belief that artificial general intelligence (AGI) – or even superhuman AI – is imminent, complex problems in expert domains are far from being solved. We argue that such problems require human-AI cooperation and that the current state of the art in generative AI is unable to play the role of a reliable partner due to a multitude of shortcomings, including difficulty in keeping track of a complex solution artifact (e.g., a software program), limited support for versatile human preference expression, and a lack of adaptation to human preferences in an interactive setting. To address these challenges, we propose HAI-Co2, a novel human-AI co-construction framework. We take first steps towards a formalization of HAI-Co2 and discuss the difficult open research problems that it faces.
@article{dutta2025problem,title={Problem {{Solving Through Human-AI Preference-Based Cooperation}}},author={Dutta, Subhabrata and Kaufmann, Timo and Glava{\v s}, Goran and Habernal, Ivan and Kersting, Kristian and Kreuter, Frauke and Mezini, Mira and Gurevych, Iryna and H{\"u}llermeier, Eyke and Sch{\"u}tze, Hinrich},year={2025},journal={Computational Linguistics},pages={1--35},doi={10.1162/coli.a.19}}
2024
MHFAIA
Comparing Comparisons: Informative and Easy Human Feedback with Distinguishability Queries
Xuening Feng, Zhaohui Jiang, Timo Kaufmann, and 3 more authors
In ICML 2024 Workshop on Models of Human Feedback for AI Alignment (MHFAIA), 2024
Learning human objectives from preference feedback has significantly advanced reinforcement learning (RL) in domains with hard-to-formalize objectives. Traditional methods with pairwise trajectory comparisons face challenges: trajectories with subtle differences are hard to compare, and comparisons are ordinal, limiting direct inference of preference strength. In this paper, we introduce the distinguishability query, where humans compare two pairs of trajectories, indicate which pair is easier to compare, and then give preference feedback on the easier pair. This type of query directly infers preference strength and is expected to reduce cognitive load on the labeler. We also connect this query to cardinal utility and difference relations, and develop an efficient query selection scheme to achieve a better trade-off between query informativeness and easiness. Experimental results demonstrate the potential of our method for faster, data-efficient learning and improved user-friendliness on RLHF benchmarks.
@inproceedings{feng2024comparing,title={Comparing {{Comparisons}}: {{Informative}} and {{Easy Human Feedback}} with {{Distinguishability Queries}}},shorttitle={Comparing {{Comparisons}}},booktitle={{{ICML}} 2024 {{Workshop}} on {{Models}} of {{Human Feedback}} for {{AI Alignment}} ({{MHFAIA}})},author={Feng, Xuening and Jiang, Zhaohui and Kaufmann, Timo and H{\"u}llermeier, Eyke and Weng, Paul and Zhu, Yifei},year={2024}}
MHFAIA
Relatively Rational: Learning Utilities and Rationalities Jointly from Pairwise Preferences
Learning utilities from preference feedback has become increasingly important, particularly in fine-tuning language models such as ChatGPT. Traditional methods often assume equal rationality among labellers, leading to inaccurate utility estimates. We propose an algorithm that jointly estimates trainer rationality and item utilities to enhance utility learning and gain additional insights from feedback. Our approach focuses on settings where feedback is received from multiple trainers, using the Boltzmann-rational model to relate choices to latent utilities while accounting for varying levels of rationality. Given shared utilities, our method identifies rationality ratios among trainers from observed choices without extra calibration data or assumptions. We analyse the theoretical impact of assuming equal rationality on utility accuracy and empirically show superior performance in an action-advice setting, where agents construct policies using the learned utilities as rewards. By accurately modelling trainer rationality, we can enhance high-quality feedback collection, potentially leading to better-aligned models and an improved understanding of human preferences.
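The underlying choice model can be written as P(i preferred over j by trainer t) = sigmoid(beta_t * (u_i - u_j)), with utilities u and rationalities beta fitted jointly. The toy gradient-ascent estimator below only illustrates this model and the identifiability point (rationality ratios are recoverable, absolute scale is not); it is not the paper's estimator.

```python
# Toy sketch: jointly fit item utilities and per-trainer rationality from pairwise choices.
import numpy as np

rng = np.random.default_rng(1)
n_items, n_trainers, n_obs = 6, 3, 3000
true_u = rng.normal(size=n_items)
true_beta = np.array([0.3, 1.0, 3.0])          # trainers differ in rationality

# Simulate choices: trainer t prefers item i over j with prob. sigmoid(beta_t * (u_i - u_j)).
t = rng.integers(0, n_trainers, n_obs)
i = rng.integers(0, n_items, n_obs)
j = rng.integers(0, n_items, n_obs)
keep = i != j
t, i, j = t[keep], i[keep], j[keep]
p = 1.0 / (1.0 + np.exp(-true_beta[t] * (true_u[i] - true_u[j])))
y = (rng.random(len(p)) < p).astype(float)     # 1 if i was chosen over j

# Joint maximum-likelihood fit of utilities and log-rationalities by gradient ascent.
u = np.zeros(n_items)
log_beta = np.zeros(n_trainers)
lr = 0.05
for _ in range(4000):
    beta = np.exp(log_beta)
    q = 1.0 / (1.0 + np.exp(-beta[t] * (u[i] - u[j])))
    err = y - q                                 # gradient of the Bernoulli log-likelihood w.r.t. the logit
    grad_u = np.zeros(n_items)
    grad_b = np.zeros(n_trainers)
    np.add.at(grad_u, i, err * beta[t])
    np.add.at(grad_u, j, -err * beta[t])
    np.add.at(grad_b, t, err * beta[t] * (u[i] - u[j]))   # chain rule through log_beta
    u += lr * grad_u / len(y)
    log_beta += lr * grad_b / len(y)

# Utilities and rationalities are identified only up to a shared scale, so compare ratios.
print(np.exp(log_beta - log_beta[1]))           # estimated rationality ratios (relative to trainer 1)
print(true_beta / true_beta[1])                 # ground-truth ratios
```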
@inproceedings{yamagata2024relatively,title={Relatively {{Rational}}: {{Learning Utilities}} and {{Rationalities Jointly}} from {{Pairwise Preferences}}},shorttitle={Relatively {{Rational}}},booktitle={{{ICML}} 2024 {{Workshop}} on {{Models}} of {{Human Feedback}} for {{AI Alignment}} ({{MHFAIA}})},author={Yamagata, Taku and Oberkofler, Tobias and Kaufmann, Timo and Bengs, Viktor and H{\"u}llermeier, Eyke and {Santos-Rodriguez}, Raul},year={2024}}
RLBRew
OCALM: Object-Centric Assessment with Language Models
Properly defining a reward signal to efficiently train a reinforcement learning (RL) agent is a challenging task. Designing balanced objective functions from which a desired behavior can emerge requires expert knowledge, especially for complex environments. Learning rewards from human feedback or using large language models (LLMs) to directly provide rewards are promising alternatives, allowing non-experts to specify goals for the agent. However, black-box reward models make it difficult to debug the reward. In this work, we propose Object-Centric Assessment with Language Models (OCALM) to derive inherently interpretable reward functions for RL agents from natural language task descriptions. OCALM uses the extensive world-knowledge of LLMs while leveraging the object-centric nature common to many environments to derive reward functions focused on relational concepts, providing RL agents with the ability to derive policies from task descriptions.
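To illustrate what an inherently interpretable, object-centric reward function looks like, here is a hand-written example of the kind of reward OCALM aims to derive from a natural language task description; the environment, object names, and coefficients are made up for illustration and are not produced by the method.

```python
# Hand-written example of an object-centric, relational reward function.
from dataclasses import dataclass
import math

@dataclass
class Obj:
    name: str
    x: float
    y: float

def distance(a: Obj, b: Obj) -> float:
    return math.hypot(a.x - b.x, a.y - b.y)

def reward(objects: dict[str, Obj]) -> float:
    """Task description: 'move the player close to the key while avoiding the enemy'."""
    player, key, enemy = objects["player"], objects["key"], objects["enemy"]
    return -distance(player, key) + 0.5 * min(distance(player, enemy), 4.0)

state = {"player": Obj("player", 0, 0), "key": Obj("key", 3, 4), "enemy": Obj("enemy", 1, 1)}
print(reward(state))  # about -4.29: every term is readable and can be debugged directly
```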
@inproceedings{kaufmann2024ocalm,title={{{OCALM}}: {{Object-Centric Assessment}} with {{Language Models}}},booktitle={{{RLC}} 2024 {{Workshop}} on {{Reinforcement Learning Beyond Rewards}} ({{RLBRew}})},author={Kaufmann, Timo and Bl{\"u}ml, Jannis and W{\"u}st, Antonia and Delfosse, Quentin and Kersting, Kristian and H{\"u}llermeier, Eyke},year={2024}}
2023
HLDM
On the Challenges and Practices of Reinforcement Learning from Real Human Feedback
Reinforcement learning from human feedback (RLHF) is a variant of reinforcement learning (RL) that does not require an engineered reward function but instead learns from human feedback. Due to its increasing popularity, various authors have studied how to learn an accurate reward model from only a few samples, making optimal use of this feedback. Because of the cost and complexity of user studies, however, this research is often conducted with synthetic human feedback. Such feedback can be generated by evaluating behavior based on ground-truth rewards which are available for some benchmark tasks. While this setting can help evaluate some aspects of RLHF, it differs from practical settings in which synthetic feedback is not available. Working with real human feedback brings additional challenges that cannot be observed with synthetic feedback, including fatigue, inter-rater inconsistencies, delay, misunderstandings, and modality-dependent difficulty. We describe and discuss some of these challenges together with current practices and opportunities for further research in this paper.
@inproceedings{kaufmann2023challenges,title={On the~{{Challenges}} and~{{Practices}} of~{{Reinforcement Learning}} from~{{Real Human Feedback}}},booktitle={Machine {{Learning}} and {{Principles}} and {{Practice}} of {{Knowledge Discovery}} in {{Databases}}},author={Kaufmann, Timo and Ball, Sarah and Beck, Jacob and H{\"u}llermeier, Eyke and Kreuter, Frauke},pages={276--294},editor={Meo, Rosa and Silvestri, Fabrizio},publisher={Springer Nature Switzerland},doi={10.1007/978-3-031-74627-7_21},year={2023}}
ML4CPS
Reinforcement Learning from Human Feedback for Cyber-Physical Systems: On the Potential of Self-Supervised Pretraining
In this paper, we advocate for the potential of reinforcement learning from human feedback (RLHF) with self-supervised pretraining to increase the viability of reinforcement learning (RL) for real-world tasks, especially in the context of cyber-physical systems (CPS). We identify potential benefits of self-supervised pretraining in terms of the query sample complexity, safety, robustness, reward exploration and transfer. We believe that exploiting these benefits, combined with the generally improving sample efficiency of RL, will likely enable RL and RLHF to play an increasing role in CPS in the future.
@inproceedings{kaufmann2023reinforcement,title={Reinforcement {{Learning}} from~{{Human Feedback}} for~{{Cyber-Physical Systems}}: {{On}} the~{{Potential}} of~{{Self-Supervised Pretraining}}},booktitle={Proceedings of the {{International Conference}} on {{Machine Learning}} for {{Cyber-Physical Systems}} ({{ML4CPS}})},author={Kaufmann, Timo and Bengs, Viktor and H{\"u}llermeier, Eyke},year={2023},publisher={Springer Nature Switzerland},doi={10.1007/978-3-031-47062-2_2}}