From 9e46486128331f3d1f6b7c704ab812001bf47aff Mon Sep 17 00:00:00 2001 From: ncoop57 Date: Thu, 12 Sep 2024 00:31:48 +0200 Subject: [PATCH] chore: Refactor code to remove unnecessary conditional statement, add readme, and tiny programs example --- README.md | 69 +++- examples/tiny_programs.py | 222 ++++++++++ fastdata/core.py | 3 +- nbs/00_core.ipynb | 25 +- nbs/01_how_to.ipynb | 846 ++++++++++++++++++++++++++++++++++++++ nbs/index.ipynb | 138 +++++-- 6 files changed, 1231 insertions(+), 72 deletions(-) create mode 100644 examples/tiny_programs.py create mode 100644 nbs/01_how_to.ipynb diff --git a/README.md b/README.md index e1b3884..14f8811 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,6 @@ -This file will become your README and also the index of your -documentation. - ## Developer Guide If you are new to using `nbdev` here are some useful pointers to get you @@ -58,10 +55,70 @@ find package manager specific guidelines on ## How to use -Fill me in please! Don’t forget code examples: +First you need to define the structure of the data you want to generate. +`instructor`, which is the library that fastdata uses to generate data, +requires you to define the schema of the data you want to generate. This +is done using pydantic models. + +``` python +from pydantic import BaseModel, Field + +class Translation(BaseModel): + english: str = Field(description="An english phrase") + german: str = Field(description="An equivalent german phrase that is a translation of the english phrase") +``` + +Next, you need to define the prompt that will be used to generate the +data and any inputs you want to pass to the prompt. + +``` python +prompt_template = """\ +Generate English and German translations on the following topic: +{topic} +""" + +inputs = [{"topic": "Otters are cute"}, {"topic": "I love programming"}] +``` + +Finally, we can generate some data with fastdata. + +> [!NOTE] +> +> We only support Anthropic models at the moment. Therefore, make sure +> you have an API key for the model you want to use and the proper +> environment variables set or pass the api key to the +> [`FastData`](https://AnswerDotAI.github.io/fastdata/core.html#fastdata) +> class `FastData(api_key="sk-ant-api03-...")`. ``` python -1+1 +from fastdata.core import FastData + +import pprint + +# Create a pretty printer object with custom settings +pp = pprint.PrettyPrinter(indent=4, width=100, compact=False) + +fast_data = FastData() +translations = fast_data.generate( + prompt_template=prompt_template, + inputs=inputs, + response_model=Translation, + model="claude-3-haiku-20240307" +) + +# Pretty print the translations +print("Translations:") +pp.pprint(translations) ``` - 2 + 100%|██████████| 2/2 [00:00<00:00, 2.21it/s] + + Translations: + [ {'english': 'Otters are cute', 'german': 'Otter sind süß'}, + {'english': 'I love programming', 'german': 'Ich liebe das Programmieren'}] + +If you’d like to see how best to generate data with fastdata, check out +our blog post [here](https://www.answer.ai/blog/introducing-fastdata) +and some of the examples in the +[examples](https://github.com/AnswerDotAI/fastdata/tree/main/examples) +directory. 
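Note that a generation that fails may come back as `None` in the results (the tiny programs example in this repo filters these out), so it is worth dropping those before further processing. A minimal sketch of that clean-up step, reusing the `translations` result from the example above (the variable names here are illustrative, not part of the fastdata API):

``` python
# Illustrative post-processing: drop failed generations (returned as None)
# before saving or analyzing the results.
valid_translations = [t for t in translations if t is not None]
print(f"Kept {len(valid_translations)} of {len(translations)} generations")
```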
diff --git a/examples/tiny_programs.py b/examples/tiny_programs.py
new file mode 100644
index 0000000..c4b953a
--- /dev/null
+++ b/examples/tiny_programs.py
@@ -0,0 +1,222 @@
+from datasets import Dataset, load_dataset
+from fastdata.core import FastData
+from pydantic import BaseModel, Field
+from typing import Literal
+
+class TinyProgram(BaseModel):
+    requirements: str = Field(description="A description of the requirements for the program to help the persona.")
+    code: str = Field(description="The code that satisfies the requirements. Ensure it is well written and documented.")
+
+class CodeCritique(BaseModel):
+    critique: str = Field(description="A critique of the code.")
+    score: Literal[1, 2, 3, 4, 5] = Field(description="A score of the code from 1 to 5.")
+
+examples = [
+    TinyProgram(
+        requirements="A Python-based data aggregation and analysis tool that scrapes key Salvadoran news websites and government portals for the latest political updates, election results, and policy changes. The program would use standard libraries like requests for web scraping, re for text parsing, and pandas for data manipulation. It would store the collected information in a structured format, perform basic sentiment analysis on news articles, and generate a daily summary report highlighting significant political events, trending topics, and shifts in public opinion. The tool could also track mentions of key political figures and parties, providing a quick overview of their media presence and associated sentiments.",
+        code="""\
+```python
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+from textblob import TextBlob
+from collections import Counter
+import datetime
+
+def scrape_news(url):
+    response = requests.get(url)
+    soup = BeautifulSoup(response.content, 'html.parser')
+    articles = soup.find_all('article', class_='article-item')
+
+    news_data = []
+    for article in articles:
+        title = article.find('h2', class_='article-title').text.strip()
+        summary = article.find('p', class_='article-summary').text.strip()
+        news_data.append({'title': title, 'summary': summary})
+
+    return news_data
+
+def analyze_sentiment(text):
+    return TextBlob(text).sentiment.polarity
+
+def generate_report(data):
+    df = pd.DataFrame(data)
+    df['sentiment'] = df['summary'].apply(analyze_sentiment)
+
+    # Calculate average sentiment
+    avg_sentiment = df['sentiment'].mean()
+
+    # Find most mentioned words
+    all_words = ' '.join(df['title'] + ' ' + df['summary']).lower().split()
+    word_freq = Counter(word for word in all_words if len(word) > 3)
+    top_words = word_freq.most_common(5)
+
+    # Generate report
+    report = f"Daily Political Analysis Report for El Salvador - {datetime.date.today()}\n\n"
+    report += f"Number of articles analyzed: {len(df)}\n"
+    report += f"Average sentiment: {'Positive' if avg_sentiment > 0 else 'Negative'} ({avg_sentiment:.2f})\n\n"
+    report += "Top mentioned words:\n"
+    for word, count in top_words:
+        report += f"- {word}: {count} times\n"
+
+    report += "\nMost positive article:\n"
+    pos_article = df.loc[df['sentiment'].idxmax()]
+    report += f"Title: {pos_article['title']}\nSentiment: {pos_article['sentiment']:.2f}\n\n"
+
+    report += "Most negative article:\n"
+    neg_article = df.loc[df['sentiment'].idxmin()]
+    report += f"Title: {neg_article['title']}\nSentiment: {neg_article['sentiment']:.2f}\n"
+
+    return report
+
+def main():
+    url = "https://www.elsalvador.com/noticias/nacional/" # Example Salvadoran news website
+    news_data = scrape_news(url)
+    report = 
generate_report(news_data) + print(report) + + # Optionally, save the report to a file + with open(f"el_salvador_political_report_{datetime.date.today()}.txt", "w") as f: + f.write(report) + +if __name__ == "__main__": + main() +``` +""" + ), + TinyProgram( + requirements="A \"Joke Personalizer\" program that takes a comedian's standard jokes as input and automatically modifies them to include inside references and shared memories from their school days. The program should use a dictionary of preset keywords (e.g., \"cafeteria\", \"Mr. Johnson's class\") and replace generic terms in the jokes with these personalized references. It should also have a \"nostalgia meter\" that rates how many childhood callbacks are in each joke, and a \"groan factor\" estimator based on the corniness of the puns. Bonus feature: a \"detention probability calculator\" that humorously predicts how likely the joke would have gotten them in trouble back in school.", + code="""\ +```python +import random +import re + +class JokePersonalizer: + def __init__(self): + self.inside_references = { + "restaurant": "cafeteria", + "teacher": "Mr. Johnson", + "friend": "Danny 'Braces' Smith", + "car": "rusty old bike", + "mall": "corner store", + "party": "detention", + "cool": "totally radical", + "phone": "Gameboy", + "computer": "TI-83 calculator", + "internet": "library encyclopedia" + } + self.pun_words = ["cheesy", "corny", "hilarious", "side-splitting", "knee-slapping"] + + def personalize_joke(self, joke): + for generic, personal in self.inside_references.items(): + joke = re.sub(r'\b' + generic + r'\b', personal, joke, flags=re.IGNORECASE) + return joke + + def nostalgia_meter(self, joke): + count = sum(1 for ref in self.inside_references.values() if ref.lower() in joke.lower()) + return min(count * 20, 100) # 20 points per reference, max 100 + + def groan_factor(self, joke): + pun_count = sum(1 for word in self.pun_words if word.lower() in joke.lower()) + return min(pun_count * 25, 100) # 25 points per pun word, max 100 + + def detention_probability(self, joke): + naughty_words = ["detention", "trouble", "principal's office", "suspended"] + probability = sum(10 for word in naughty_words if word.lower() in joke.lower()) + return min(probability, 100) # 10% per naughty word, max 100% + + def process_joke(self, original_joke): + personalized_joke = self.personalize_joke(original_joke) + nostalgia = self.nostalgia_meter(personalized_joke) + groan = self.groan_factor(personalized_joke) + detention_prob = self.detention_probability(personalized_joke) + + return { + "original": original_joke, + "personalized": personalized_joke, + "nostalgia_rating": nostalgia, + "groan_factor": groan, + "detention_probability": detention_prob + } + +# Example usage +personalizer = JokePersonalizer() + +jokes = [ + "I went to a restaurant last night and had the best meal ever!", + "My teacher asked me to stay after class, it was so cool!", + "I threw a party and nobody came. 
It was a real phone-y situation!", +] + +for joke in jokes: + result = personalizer.process_joke(joke) + print(f"Original: {result['original']}") + print(f"Personalized: {result['personalized']}") + print(f"Nostalgia Rating: {result['nostalgia_rating']}%") + print(f"Groan Factor: {result['groan_factor']}%") + print(f"Detention Probability: {result['detention_probability']}%") + print() +``` +""" + ), +] +examples = "\n".join(f"- {example}" for example in examples) + +# Load personas +personas = load_dataset("proj-persona/PersonaHub", "persona", split='train').select(range(1_000))['persona'] + +prompt_template = """\ +Here are some examples: +{examples} + +Create requirements and the python program that satisfies them for the following persona: {persona} +""" + +# Generate tiny programs +fast_data = FastData() +tiny_programs = fast_data.generate( + prompt_template=prompt_template, + inputs=[{"persona": persona, "examples": examples} for persona in personas], + response_model=TinyProgram, + model="claude-3-haiku-20240307" +) +# remove Nones +tiny_programs = [t for t in tiny_programs if t is not None] + +critique_template = """\ +Below is a code snippet. Evaluate its educational value for teaching programming to beginners in this language, using the additive 5-point scoring system described below. Points are accumulated based on the satisfaction of each criterion: + +- Add 1 point if the code is syntactically correct and runs without errors, providing a basic example of working code in the language. +- Add another point if the code demonstrates fundamental programming concepts (e.g., variables, control structures, functions) in a straightforward manner, even if it's not optimized or doesn't follow all best practices. +- Award a third point if the code is well-commented, explaining key concepts and the purpose of different code sections. It should be readable and illustrate good naming conventions, making it easier for beginners to understand. +- Grant a fourth point if the code showcases language-specific features or common programming patterns in an accessible way. It should provide clear examples of how to apply these concepts practically. +- Bestow a fifth point if the code is an exemplary teaching tool, striking an excellent balance between simplicity and real-world applicability. It should inspire further learning, possibly including deliberate mistakes or opportunities for improvement that a teacher could use as discussion points. + +The code snippet: +```python +{code} +``` + +After examining the code: + +- Briefly justify your total score, up to 100 words, focusing on its effectiveness as a teaching tool for beginners. +- Conclude with the score. 
+""" + +critiques = fast_data.generate( + prompt_template=critique_template, + inputs=[{"code": f"{t['code']}"} for t in tiny_programs], + response_model=TranslationCritique, + model="claude-3-5-sonnet-20240620" +) + +# Update tiny_programs with critiques +for program, critique in zip(tiny_programs, critiques): + if program is None or critique is None: + continue + program['critique'] = critique['critique'] + program['score'] = critique['score'] + + +ds = Dataset.from_list(tiny_programs) +ds.push_to_hub("answerdotai/tiny_programs", private=True) \ No newline at end of file diff --git a/fastdata/core.py b/fastdata/core.py index d15fe16..6cebbf3 100644 --- a/fastdata/core.py +++ b/fastdata/core.py @@ -59,7 +59,6 @@ def process_input(input_data): futures = [executor.submit(process_input, input_data) for input_data in inputs] for future in tqdm(concurrent.futures.as_completed(futures), total=len(inputs)): result = future.result() - if result: - results.append(result) + results.append(result) return results diff --git a/nbs/00_core.ipynb b/nbs/00_core.ipynb index 67a81f8..eec7c29 100644 --- a/nbs/00_core.ipynb +++ b/nbs/00_core.ipynb @@ -30,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -45,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -95,15 +95,14 @@ " futures = [executor.submit(process_input, input_data) for input_data in inputs]\n", " for future in tqdm(concurrent.futures.as_completed(futures), total=len(inputs)):\n", " result = future.result()\n", - " if result:\n", - " results.append(result)\n", + " results.append(result)\n", " \n", " return results" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -178,7 +177,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -284,7 +283,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -298,18 +297,6 @@ "display_name": "python3", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" } }, "nbformat": 4, diff --git a/nbs/01_how_to.ipynb b/nbs/01_how_to.ipynb new file mode 100644 index 0000000..82c3572 --- /dev/null +++ b/nbs/01_how_to.ipynb @@ -0,0 +1,846 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "from fastdata.core import *" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# How To ~~Train~~ Synthesize Your ~~Dragon~~ Data\n", + "\n", + "> The fastest way to create high quality synthetic data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introduction\n", + "\n", + "Synthetic data has become a popular topic in the field of large language models (LLMs) recently with modern LLMs such as Meta's Llama 3 models being trained on a large portion of synthetic data. This blog post attempts to introduce synthetic data generation and showcase the important bits to consider when generating synthetic data." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Synthetic Data Overview\n", + "\n", + "When we refer to synthetic data in this blog post, we are referring to data that is generated by an LLM. Synthetic data has a number of benefits over other types of data such as web scraped data. The first and foremost is the controllability of the data. We can specify the type of data we want to generate, targetting specific tasks or languages, levels of difficulty, and even specific topics. We can also easily generate large amounts of data in a shorter amount of time due to the parallelizable nature of LLMs compared to using an annotation service such as [scale.ai](scale.ai). However, there are also some downsides to synthetic data. The first is that it is not always clear how to define the quality of the data. The second is that it can be difficult to generate diverse data which covers the long tail distribution of real world knowledge. The third is that it can be difficult to ensure that the data is faithfully represents the real world, i.e., does not contain hallucinations. Additionally, there are some concerns that training on synthetic data can lead to model collapse. For example, the awesome paper [The Curse of Recursion: Training on Generated Data Makes Models Forget](https://arxiv.org/abs/2305.17493) showed that consecutively training a model on data generated from itself leads to the model forgetting the original training data and devolving into nonsense. This blog post will discuss and demonstrate many lessons learned in the field of synthetic data generation.\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Important Bits\n", + "\n", + "Two of the most important components behind data is quality and diversity. These two components sadly can be in conflict with each other. For example, a random string generator will give you a ton of diversity, but the quality will be low. On the other hand, looking at Encyclopedia Britannica, you will find high quality articles of various topics, but they will all be written in a similar style to each other and will lack depth on many topics such as mathematics or will not contain any content on others such as fan fictions of popular tv shows, movies and novels.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## How to Nail the Important Bits\n", + "\n", + "Let's start with diversity since it is the simplest, especially due to how LLMs work. LLMs at their core are a probability distribution of possible words given previous words in a sequence with words that often come after other words being more probable in this probability distribution. We can easily sample from and manipulate this distribution to cover as much of the language space as we want. A recent popular method to do this is to bias the distribution using topics or personas that lead the LLM to generate words in the direction of that topic or in the style of the given persona. This has a nice benefit of not degrading the quality of the generated text, which can happen with other methods such as increasing the temperature of a given sampling method as discussed in [The Curious Case of Neural Text Degeneration](https://arxiv.org/abs/1904.09751) by Holtzman et. al. Here is an example of the persona approach in action from the recently released paper [Scaling Synthetic Data Creation with 1,000,000,000 Personas](https://arxiv.org/abs/2406.20094v1) by Chan et. al. 
Now this sadly only solves one aspect of diversity, that of breath. There is however, a depth associated with diversity which I think is best expressed in the paper [WizardLM: Empowering Large Language Models to Follow Complex Instructions](https://arxiv.org/abs/2304.12244) by Xu et. al. In this paper, the authors evolve seed instructions in terms of breath, the type of topics used for an instruction, and depth, the complexity of the instruction.\n", + "\n", + "Great, diversity, check! How about quality? Unfortunately quality is a lot more fickled than diversity in this context. For example, high quality can refer to how well a particular text is written, it could refer to the accuracy of the information presented in the text (e.g., a mathematical proof), the readability of the text, etc. An analogy that might help encapsulate the issue is what happens with grading student exams. Usually, and especially for free form responses, the answer of the student is graded against a rubric that represents different characteristics to determine the quality of the answer. One method to try and improve quality is use personas where we specify a persona that is an expert in whatever data we seek to generate (e.g., `You are an expert senior level Python developer with deep knowledge of numpy and pandas`). However, this approach does not get around issues with hallucinations that LLMs are plagued with. Therefore, the approach I use most often is fixing in post (aka filtering). The idea is to initially generate a large diverse set of data and then find the bits and pieces that align with whatever notion (i.e., get a high score on your rubric) of quality you have for your data. For certain types of data such as code, we can use heuristics and simple filters such as whether the code compiles. However, for more abstract ideas of quality, we need to get more creative. One approach I quite like and have done a lot of work on is from the work [Textbooks Are All You Need](https://arxiv.org/abs/2306.11644) by Gunasekar et. al. In this work, the authors show how you can use a strong large language model to classify code files as high or low quality where they defined quality as the educational content of the file. Filtering a large collection of code data for only high quality educational code resulted in fairly sizable bumps in downstream performance. This LLM filtering technique has been utilized in a number of other works such as [How to Train Data-Efficient LLMs](https://arxiv.org/abs/2402.09668) and [The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale](https://arxiv.org/abs/2406.17557) to similar affects." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Let's Play with some Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You don't need to take my word for these. Let's use this fun little library called [`instructor`](https://github.com/jxnl/instructor) to showcase some of these ideas. `instructor` is a library for forcing LLMs to generate data in a specific format such as a given JSON schema. For example, let's say we want to generate a dataset of english and german phrases. We can define a [`pydantic`](https://docs.pydantic.dev/latest/) model to represent the data and then use `instructor` to generate the data. 
Below is a code snippet that shows how to do this:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pydantic import BaseModel, Field\n", + "\n", + "class EnglishToGerman(BaseModel):\n", + " english: str = Field(description=\"An english phrase\")\n", + " german: str = Field(description=\"An equivalent german phrase that is a translation of the english phrase\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`instructor` supports many models and model providers. For this example, we'll use Anthropic:\n", + "\n", + "::: {.callout-note}\n", + "Make sure you have an API key for the model you want to use and the proper environment variables set. For example, if you are using Anthropic, you need to set the `ANTHROPIC_API_KEY` environment variable.\n", + ":::" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "english='I would like to create an English and German translation pair.' german=''\n", + "english='Hello, how are you today?' german='Hallo, wie geht es Ihnen heute?'\n", + "english='How are you doing today?' german='Wie geht es Ihnen heute?'\n", + "english='Hello, how are you today?' german='Hallo, wie geht es Ihnen heute?'\n", + "english='Hello, how are you today?' german='Hallo, wie geht es Ihnen heute?'\n" + ] + } + ], + "source": [ + "import anthropic\n", + "import instructor\n", + "\n", + "client = instructor.from_anthropic(anthropic.Anthropic())\n", + "for _ in range(5):\n", + " translation: EnglishToGerman = client.chat.completions.create(\n", + " model=\"claude-3-haiku-20240307\", # let's use the small, but mighty haiku model\n", + " max_tokens=512,\n", + " max_retries=0,\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": \"Create an english and german translation pair\",\n", + " }\n", + " ],\n", + " response_model=EnglishToGerman,\n", + " )\n", + " print(translation)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Eindrucksvoll! Looking at the output, we can see that the model is able to generate the data in the correct format. However, these pairs are quite simple and lack depth. Let's see if we can improve the quality of the generations by adding some examples. To do this, we will do some prompt engineering to give the model some examples that showcase the type of quality we want." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Create an english and german translation pair that is similar to the examples.\n", + "\n", + "Here are some examples:\n", + "- Hello, my name is Nathan. I am a research scientist at an AI startup. -> Hallo mein Name ist Nathan. Ich bin wissenschaftlicher Mitarbeiter bei einem KI-Startup.\n", + "- How much wood could a woodchuck chuck if a woodchuck could chuck wood? -> Wie viel Holz könnte ein Waldmurmeltier einspannen, wenn ein Waldmurmeltier Holz einspannen könnte?\n", + "- Thomas Cranmer (2 July 1489 - 21 March 1556) was a leader of the English Reformation and Archbishop of Canterbury during the reigns of Henry VIII, Edward VI and, for a short time, Mary I. He helped build the case for the annulment of Henry's marriage to Catherine of Aragon, which was one of the causes of the separation of the English Church from union with the Holy See. -> Thomas Cranmer (2. Juli 1489 - 21. 
März 1556) war ein Anführer der englischen Reformation und Erzbischof von Canterbury während der Herrschaft von Heinrich VIII., Eduard VI. und für kurze Zeit auch Maria I. Er half bei der Ausarbeitung der Klage für die Aufhebung von Heinrichs Heirat mit Katharina von Aragon, die eine der Ursachen für die Trennung der englischen Kirche von der Union mit dem Heiligen Stuhl war.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "english='The quick brown fox jumps over the lazy dog.' german='Der schnelle braune Fuchs springt über den faulen Hund.'\n", + "english='The quick brown fox jumps over the lazy dog.' german='Der schnelle braune Fuchs springt über den faulen Hund.'\n", + "english='The quick brown fox jumps over the lazy dog.' german='Der schnelle braune Fuchs springt über den faulen Hund.'\n", + "english='The new exhibit at the museum explores the history of ancient civilizations.' german='Die neue Ausstellung im Museum erforscht die Geschichte alter Zivilisationen.'\n", + "english='The quick brown fox jumps over the lazy dog.' german='Der schnelle braune Fuchs springt über den faulen Hund.'\n" + ] + } + ], + "source": [ + "examples = [\n", + " EnglishToGerman(\n", + " english=\"Hello, my name is Nathan. I am a research scientist at an AI startup.\",\n", + " german=\"Hallo mein Name ist Nathan. Ich bin wissenschaftlicher Mitarbeiter bei einem KI-Startup.\"),\n", + " EnglishToGerman(\n", + " english=\"How much wood could a woodchuck chuck if a woodchuck could chuck wood?\",\n", + " german=\"Wie viel Holz könnte ein Waldmurmeltier einspannen, wenn ein Waldmurmeltier Holz einspannen könnte?\"),\n", + " EnglishToGerman(\n", + " english=\"Thomas Cranmer (2 July 1489 - 21 March 1556) was a leader of the English Reformation and Archbishop of Canterbury during the reigns of Henry VIII, Edward VI and, for a short time, Mary I. He helped build the case for the annulment of Henry's marriage to Catherine of Aragon, which was one of the causes of the separation of the English Church from union with the Holy See.\",\n", + " german=\"Thomas Cranmer (2. Juli 1489 - 21. März 1556) war ein Anführer der englischen Reformation und Erzbischof von Canterbury während der Herrschaft von Heinrich VIII., Eduard VI. und für kurze Zeit auch Maria I. Er half bei der Ausarbeitung der Klage für die Aufhebung von Heinrichs Heirat mit Katharina von Aragon, die eine der Ursachen für die Trennung der englischen Kirche von der Union mit dem Heiligen Stuhl war.\"\n", + " ),\n", + "]\n", + "\n", + "prompt = \"\"\"\\\n", + "Create an english and german translation pair that is similar to the examples.\n", + "\n", + "Here are some examples:\n", + "- {examples}\n", + "\"\"\"\n", + "prompt = prompt.format(examples=\"\\n- \".join([f\"{e.english} -> {e.german}\" for e in examples]))\n", + "print(prompt)\n", + "\n", + "for _ in range(5):\n", + " translation: EnglishToGerman = client.chat.completions.create(\n", + " model=\"claude-3-haiku-20240307\", # let's use the small, but mighty haiku model\n", + " max_tokens=512,\n", + " max_retries=0,\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": prompt,\n", + " }\n", + " ],\n", + " response_model=EnglishToGerman,\n", + " )\n", + " print(translation)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Interesting! We are getting some better results, but a lot of duplicates. 
One thing that I discovered while prompting these models is that where you place your examples in the prompt can have a big impact on the quality of the generations. For example, if you place them at the end like the above does, the model will often repeat the examples in the generations. Let's try placing them at the beginning." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Here are some examples:\n", + "- Hello, my name is Nathan. I am a research scientist at an AI startup. -> Hallo mein Name ist Nathan. Ich bin wissenschaftlicher Mitarbeiter bei einem KI-Startup.\n", + "- How much wood could a woodchuck chuck if a woodchuck could chuck wood? -> Wie viel Holz könnte ein Waldmurmeltier einspannen, wenn ein Waldmurmeltier Holz einspannen könnte?\n", + "- Thomas Cranmer (2 July 1489 - 21 March 1556) was a leader of the English Reformation and Archbishop of Canterbury during the reigns of Henry VIII, Edward VI and, for a short time, Mary I. He helped build the case for the annulment of Henry's marriage to Catherine of Aragon, which was one of the causes of the separation of the English Church from union with the Holy See. -> Thomas Cranmer (2. Juli 1489 - 21. März 1556) war ein Anführer der englischen Reformation und Erzbischof von Canterbury während der Herrschaft von Heinrich VIII., Eduard VI. und für kurze Zeit auch Maria I. Er half bei der Ausarbeitung der Klage für die Aufhebung von Heinrichs Heirat mit Katharina von Aragon, die eine der Ursachen für die Trennung der englischen Kirche von der Union mit dem Heiligen Stuhl war.\n", + "\n", + "Create an english and german translation pair that is similar to the examples.\n", + "\n", + "english='The scientist conducted an experiment in the lab today.' german='Der Wissenschaftler führte heute ein Experiment im Labor durch.'\n", + "english='I work as a software engineer at a tech company in San Francisco. My team is working on developing a new machine learning model for natural language processing.' german='Ich arbeite als Softwareingenieur in einem Technologieunternehmen in San Francisco. Mein Team arbeitet an der Entwicklung eines neuen Maschinenlernmodells für die Verarbeitung natürlicher Sprache.'\n", + "english='The weather is beautiful today. Shall we go for a walk in the park?' german='Das Wetter ist heute wunderschön. Sollen wir einen Spaziergang im Park machen?'\n", + "english='My favorite animal is the giraffe. They are such fascinating creatures with their long necks and spots.' german='Mein Lieblingstier ist die Giraffe. Sie sind faszinierende Geschöpfe mit ihren langen Hälsen und Flecken.'\n", + "english='The quick brown fox jumps over the lazy dog.' 
german='Der flinke braune Fuchs springt über den faulen Hund.'\n"
+    ]
+   }
+  ],
+  "source": [
+   "prompt = \"\"\"\\\n",
+   "Here are some examples:\n",
+   "- {examples}\n",
+   "\n",
+   "Create an english and german translation pair that is similar to the examples.\n",
+   "\"\"\"\n",
+   "prompt = prompt.format(examples=\"\\n- \".join([f\"{e.english} -> {e.german}\" for e in examples]))\n",
+   "print(prompt)\n",
+   "\n",
+   "for _ in range(5):\n",
+   "    translation: EnglishToGerman = client.chat.completions.create(\n",
+   "        model=\"claude-3-haiku-20240307\", # let's use the small, but mighty haiku model\n",
+   "        max_tokens=512,\n",
+   "        max_retries=0,\n",
+   "        messages=[\n",
+   "            {\n",
+   "                \"role\": \"user\",\n",
+   "                \"content\": prompt,\n",
+   "            }\n",
+   "        ],\n",
+   "        response_model=EnglishToGerman,\n",
+   "    )\n",
+   "    print(translation)"
+  ]
+ },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "Woah, what a difference! Though we are still seeing that the model really likes foxes jumping over dogs. Let's see what happens if we focus on improving diversity instead of quality. To accomplish this, we will use a list of topics to guide the generations."
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [
+   {
+    "name": "stdout",
+    "output_type": "stream",
+    "text": [
+     "english='Otters are adorable aquatic mammals that live near rivers, lakes, and coastlines.' german='Otter sind entzückende Wassersäugetiere, die in der Nähe von Flüssen, Seen und Küstengebieten leben.'\n",
+     "english='Penguins are flightless birds that live in cold regions near the South Pole.' german='Pinguine sind flugunfähige Vögel, die in kalten Regionen in der Nähe des Südpols leben.'\n",
+     "english='Sloths are slow-moving tree-dwelling mammals found in Central and South America.' german='Faultiere sind langsam bewegende, baumlebende Säugetiere, die in Mittel- und Südamerika vorkommen.'\n",
+     "english='Cats are adorable furry companions that bring joy to many households.' german='Katzen sind entzückende pelzige Begleiter, die vielen Haushalten Freude bringen.'\n",
+     "english='Dogs are loyal and loving companions.' german='Hunde sind treue und liebevolle Begleiter.'\n"
+    ]
+   }
+  ],
+  "source": [
+   "topics = [\"otters\", \"penguins\", \"sloths\", \"cats\", \"dogs\"]\n",
+   "for topic in topics:\n",
+   "    prompt = \"\"\"\\\n",
+   "    Create an english and german translation pair about the following topic:\n",
+   "    {topic}\n",
+   "    \"\"\"\n",
+   "    prompt = prompt.format(topic=topic)\n",
+   "    translation: EnglishToGerman = client.chat.completions.create(\n",
+   "        model=\"claude-3-haiku-20240307\", # let's use the small, but mighty haiku model\n",
+   "        max_tokens=512,\n",
+   "        max_retries=0,\n",
+   "        messages=[\n",
+   "            {\n",
+   "                \"role\": \"user\",\n",
+   "                \"content\": prompt,\n",
+   "            }\n",
+   "        ],\n",
+   "        response_model=EnglishToGerman,\n",
+   "    )\n",
+   "    print(translation)"
+  ]
+ },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "Okay nice, we're getting some diversity based on our list of topics. Also, since we are using a relatively powerful model, the quality is pretty good. However, if we were to use a smaller model, we would likely see a drop in quality. Let's try combining our examples and topics tricks."
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [
+   {
+    "name": "stdout",
+    "output_type": "stream",
+    "text": [
+     "english='Otters are semiaquatic mammals that belong to the weasel family. They have webbed feet and dense fur that helps them stay warm in the water. Otters are playful creatures and are known for their love of sliding down muddy banks into rivers and streams.' german='Otter sind halbaquatische Säugetiere, die zur Familie der Marder gehören. Sie haben Schwimmhäute zwischen den Zehen und ein dichtes Fell, das ihnen hilft, im Wasser warm zu bleiben. Otter sind verspielt und bekannt dafür, dass sie gerne an schlammigen Ufern in Flüsse und Bäche rutschen.'\n",
+     "english='Penguins are flightless birds that live in the Southern Hemisphere. They have black and white plumage and distinctive beaks.' german='Pinguine sind flugunfähige Vögel, die auf der Südhalbkugel leben. Sie haben ein schwarz-weißes Gefieder und auffällige Schnäbel.'\n",
+     "english='Sloths are slow-moving animals that live in the treetops of tropical forests in Central and South America. They have long limbs, sharp claws, and move at a leisurely pace, often spending most of the day sleeping.' german='Faultiere sind langsam bewegende Tiere, die in den Baumkronen der tropischen Wälder in Mittel- und Südamerika leben. Sie haben lange Gliedmaßen, scharfe Krallen und bewegen sich in gemächlichem Tempo, wobei sie oft den größten Teil des Tages schlafend verbringen.'\n",
+     "english=\"My cat loves to nap in the sun. She's such a lazy feline.\" german='Meine Katze liebt es, in der Sonne zu dösen. Sie ist eine so faule Katze.'\n",
+     "english='My dog Buddy loves to chase squirrels in the park. He is a very playful and energetic golden retriever.' german='Mein Hund Buddy liebt es, im Park Eichhörnchen zu jagen. Er ist ein sehr spielerischer und energiegeladener Golden Retriever.'\n"
+    ]
+   }
+  ],
+  "source": [
+   "for topic in topics:\n",
+   "    prompt = \"\"\"\\\n",
+   "Here are some examples:\n",
+   "- {examples}\n",
+   "\n",
+   "Create an english and german translation pair that is similar to the examples and is about the following topic:\n",
+   "{topic}\n",
+   "    \"\"\"\n",
+   "    prompt = prompt.format(examples=\"\\n- \".join([f\"{e.english} -> {e.german}\" for e in examples]), topic=topic)\n",
+   "    translation: EnglishToGerman = client.chat.completions.create(\n",
+   "        model=\"claude-3-haiku-20240307\", # let's use the small, but mighty haiku model\n",
+   "        max_tokens=512,\n",
+   "        max_retries=0,\n",
+   "        messages=[\n",
+   "            {\n",
+   "                \"role\": \"user\",\n",
+   "                \"content\": prompt,\n",
+   "            }\n",
+   "        ],\n",
+   "        response_model=EnglishToGerman,\n",
+   "    )\n",
+   "    print(translation)"
+  ]
+ },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "Not too shabby, and I'd say better than the previous examples. However, it definitely needs work and has some interesting quirks, such as the more exotic animals having phrases that read like descriptions you might find in a Wikipedia article, whereas the domesticated animals (cat and dog) have phrases of owners discussing them. Similar to what was discussed above, the order in which these examples and topics appear in the prompt can significantly impact the quality and diversity of the generations. Let's try swapping them."
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [
+   {
+    "name": "stdout",
+    "output_type": "stream",
+    "text": [
+     "english='Otters are playful and intelligent aquatic mammals. They have webbed feet, sleek fur, and can swim rapidly.' german='Otter sind verspielt und intelligente Säugetiere, die im Wasser leben. 
Sie haben geschwänzte Füße, ein glattes Fell und können schnell schwimmen.'\n", + "english='Penguins are fascinating birds that live in the coldest places on Earth.' german='Pinguine sind faszinierende Vögel, die in den kältesten Orten der Erde leben.'\n", + "english='Sloths are slow-moving mammals that live in the trees of Central and South America.' german='Faultiere sind langsam bewegende Säugetiere, die in den Bäumen Mittel- und Südamerikas leben.'\n", + "english='Cats are wonderful pets. They are very independent and playful animals.' german='Katzen sind wunderbare Haustiere. Sie sind sehr unabhängig und verspielt.'\n", + "english='My dog is very friendly. He loves to play fetch with me.' german='Mein Hund ist sehr freundlich. Er liebt es, Fangen mit mir zu spielen.'\n" + ] + } + ], + "source": [ + "for topic in topics:\n", + " prompt = \"\"\"\\\n", + "Create an english and german translation pair that is similar to the examples and is about the following topic:\n", + "{topic}\n", + "\n", + "Here are some examples:\n", + "- {examples}\n", + " \"\"\"\n", + " prompt = prompt.format(examples=\"\\n- \".join([f\"{e.english} -> {e.german}\" for e in examples]), topic=topic)\n", + " translation: EnglishToGerman = client.chat.completions.create(\n", + " model=\"claude-3-haiku-20240307\", # let's use the small, but mighty haiku model\n", + " max_tokens=512,\n", + " max_retries=0,\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": prompt,\n", + " }\n", + " ],\n", + " response_model=EnglishToGerman,\n", + " )\n", + " print(translation)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As you can see, the generations are already way shorter than the previous ones, most likely because the examples are shorter as well. Another way we can improve quality is to simply use a bigger model. Let's see what we get when using Claude 3.5 Sonnet." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "english='Otters are semi-aquatic mammals known for their playful behavior and their ability to use tools. They are found in rivers, lakes, and coastal areas around the world. These adorable creatures have thick, water-repellent fur that keeps them warm in cold waters.' german='Otter sind semi-aquatische Säugetiere, die für ihr verspieltes Verhalten und ihre Fähigkeit, Werkzeuge zu benutzen, bekannt sind. Sie kommen in Flüssen, Seen und Küstengebieten auf der ganzen Welt vor. Diese niedlichen Tiere haben ein dickes, wasserabweisendes Fell, das sie in kaltem Wasser warm hält.'\n", + "english='Penguins are flightless seabirds that are highly adapted for life in the water. They are found almost exclusively in the Southern Hemisphere, particularly in Antarctica. Despite their inability to fly, penguins are excellent swimmers and can dive to great depths in search of food.' german='Pinguine sind flugunfähige Seevögel, die hervorragend an das Leben im Wasser angepasst sind. Sie kommen fast ausschließlich auf der Südhalbkugel vor, insbesondere in der Antarktis. Trotz ihrer Flugunfähigkeit sind Pinguine ausgezeichnete Schwimmer und können auf der Suche nach Nahrung in große Tiefen tauchen.'\n", + "english='Sloths are fascinating creatures known for their slow movement and unique lifestyle. They spend most of their lives hanging upside down in the trees of tropical rainforests. 
Despite their sluggish appearance, sloths are excellent swimmers and can hold their breath underwater for up to 40 minutes.' german='Faultiere sind faszinierende Kreaturen, die für ihre langsame Bewegung und einzigartige Lebensweise bekannt sind. Sie verbringen den Großteil ihres Lebens kopfüber hängend in den Bäumen tropischer Regenwälder. Trotz ihres trägen Erscheinungsbildes sind Faultiere ausgezeichnete Schwimmer und können ihren Atem unter Wasser bis zu 40 Minuten lang anhalten.'\n",
+     "english='Cats are fascinating creatures that have been domesticated for thousands of years. They are known for their independent nature, playful behavior, and ability to form strong bonds with their human companions. Many people appreciate cats for their low-maintenance lifestyle and soothing purrs.' german='Katzen sind faszinierende Geschöpfe, die seit Jahrtausenden domestiziert wurden. Sie sind bekannt für ihre unabhängige Natur, ihr verspieltes Verhalten und ihre Fähigkeit, starke Bindungen zu ihren menschlichen Begleitern aufzubauen. Viele Menschen schätzen Katzen wegen ihres pflegeleichten Lebensstils und ihres beruhigenden Schnurrens.'\n",
+     "english=\"Dogs are often referred to as man's best friend, known for their loyalty, companionship, and diverse breeds ranging from tiny Chihuahuas to large Great Danes.\" german='Hunde werden oft als des Menschen bester Freund bezeichnet und sind bekannt für ihre Treue, Kameradschaft und vielfältige Rassen, die von winzigen Chihuahuas bis hin zu großen Doggen reichen.'\n"
+    ]
+   }
+  ],
+  "source": [
+   "translations = []\n",
+   "for topic in topics:\n",
+   "    prompt = \"\"\"\\\n",
+   "Here are some examples:\n",
+   "- {examples}\n",
+   "\n",
+   "Create an english and german translation pair that is similar to the examples and is about the following topic:\n",
+   "{topic}\n",
+   "    \"\"\"\n",
+   "    prompt = prompt.format(examples=\"\\n- \".join([f\"{e.english} -> {e.german}\" for e in examples]), topic=topic)\n",
+   "    translation: EnglishToGerman = client.chat.completions.create(\n",
+   "        model=\"claude-3-5-sonnet-20240620\", # let's use Anthropic's best model to date\n",
+   "        max_tokens=512,\n",
+   "        max_retries=0,\n",
+   "        messages=[\n",
+   "            {\n",
+   "                \"role\": \"user\",\n",
+   "                \"content\": prompt,\n",
+   "            }\n",
+   "        ],\n",
+   "        response_model=EnglishToGerman,\n",
+   "    )\n",
+   "    print(translation)\n",
+   "    translations.append(translation)"
+  ]
+ },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "Haiku already has some pretty good generations, so these aren't too much of an improvement, but they are definitely more detailed and longer. Additionally, it does not suffer from having the cat and dog examples written in a different style from the others.\n",
+   "\n",
+   "Now, we can keep tweaking the prompt and examples to get the best results. However, hallucinations and other general quality issues will continue to crop up. Therefore, let us see how we can implement a common post-processing step to clean up the generations by using another LLM to evaluate them. To do this, we will be following the same additive 5-point scoring system that was found to be the most effective in the wonderful paper [The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale](https://arxiv.org/abs/2406.17557). Additionally, we will have the model generate a written critique of the translation before scoring it, which should also help with accurately evaluating the generations. Luckily, this is quite easy to do with `instructor`: since the responses are autoregressive, we can construct our `pydantic` model in such a way that the model generates the critique first, which is then used to score the generations."
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "from pydantic import BaseModel, Field\n",
+   "from typing import Literal\n",
+   "\n",
+   "class TranslationCritique(BaseModel):\n",
+   "    critique: str = Field(description=\"A critique of the translation.\")\n",
+   "    score: Literal[0, 1, 2, 3, 4, 5] = Field(description=\"A score of the translation from 0 to 5.\")\n",
+   "\n",
+   "prompt = \"\"\"\\\n",
+   "Below is an extract of a translation. Evaluate its quality as a senior translator would, considering its suitability for professional use. Use the additive 5-point scoring system described below. Points are accumulated based on the satisfaction of each criterion:\n",
+   "\n",
+   "- Add 1 point if the translation conveys the basic meaning of the source text, even if it includes some minor errors or awkward phrasing.\n",
+   "- Add another point if the translation is generally accurate but lacks refinement in style or fails to capture some nuances of the original. It might use inconsistent terminology or have occasional lapses in register.\n",
+   "- Award a third point if the translation is appropriate for professional use and accurately conveys key concepts of the source text. It demonstrates good understanding of both languages, though it may not be flawless or could include some slight inconsistencies. It resembles the work of a competent translator but may have room for improvement in fluency or precision.\n",
+   "- Grant a fourth point if the translation is highly accurate and reads naturally in the target language, exhibiting a consistent and appropriate style. It could be similar to the work of an experienced translator, offering faithful rendering of content and tone, with minimal errors, and effectively handling complex concepts or cultural references. The result is coherent, well-expressed, and valuable for its intended purpose.\n",
+   "- Bestow a fifth point if the translation is outstanding, demonstrating mastery of both source and target languages. It captures subtle nuances, maintains the author's voice and intent, and reads as if it were originally written in the target language. The translator has made excellent choices in dealing with challenging elements like wordplay, idiomatic expressions, or culture-specific content.\n",
+   "\n",
+   "The translation extract: {translation}\n",
+   "\n",
+   "After examining the translation:\n",
+   "\n",
+   "- Briefly justify your total score, up to 100 words.\n",
+   "- Conclude with the score of the translation.\n",
+   "\"\"\""
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [
+   {
+    "name": "stdout",
+    "output_type": "stream",
+    "text": [
+     "english='Otters are semi-aquatic mammals known for their playful behavior and their ability to use tools. They are found in rivers, lakes, and coastal areas around the world. These adorable creatures have thick, water-repellent fur that keeps them warm in cold waters.' german='Otter sind semi-aquatische Säugetiere, die für ihr verspieltes Verhalten und ihre Fähigkeit, Werkzeuge zu benutzen, bekannt sind. Sie kommen in Flüssen, Seen und Küstengebieten auf der ganzen Welt vor. 
Diese niedlichen Tiere haben ein dickes, wasserabweisendes Fell, das sie in kaltem Wasser warm hält.'\n", + "Critique: The German translation is excellent, earning all 5 points:\n", + "1. It accurately conveys the basic meaning.\n", + "2. It captures nuances and maintains consistency.\n", + "3. It's suitable for professional use with accurate key concepts.\n", + "4. It reads naturally with appropriate style and terminology.\n", + "5. It demonstrates mastery, capturing subtle nuances and maintaining the original tone.\n", + "\n", + "The translation precisely conveys all information from the source text, including scientific terms (semi-aquatic, mammals) and descriptive language (playful, adorable). It maintains the structure and flow of the original while sounding natural in German. The translator has made excellent word choices, such as \"niedlich\" for \"adorable\" and \"wasserabweisend\" for \"water-repellent,\" showing a deep understanding of both languages.\n", + "Score: 5\n", + "english='Penguins are flightless seabirds that are highly adapted for life in the water. They are found almost exclusively in the Southern Hemisphere, particularly in Antarctica. Despite their inability to fly, penguins are excellent swimmers and can dive to great depths in search of food.' german='Pinguine sind flugunfähige Seevögel, die hervorragend an das Leben im Wasser angepasst sind. Sie kommen fast ausschließlich auf der Südhalbkugel vor, insbesondere in der Antarktis. Trotz ihrer Flugunfähigkeit sind Pinguine ausgezeichnete Schwimmer und können auf der Suche nach Nahrung in große Tiefen tauchen.'\n", + "Critique: The German translation is excellent, deserving all 5 points. It accurately conveys the full meaning of the English text, maintaining both factual content and tone. The translator has chosen precise German equivalents for key terms (e.g., \"flightless seabirds\" as \"flugunfähige Seevögel\"). The text reads naturally in German, with appropriate sentence structure and idiomatic expressions. It captures nuances like \"highly adapted\" (\"hervorragend angepasst\") and maintains consistency in style and register throughout. The translation demonstrates mastery of both languages and would be suitable for professional or academic use without any modifications.\n", + "Score: 5\n", + "english='Sloths are fascinating creatures known for their slow movement and unique lifestyle. They spend most of their lives hanging upside down in the trees of tropical rainforests. Despite their sluggish appearance, sloths are excellent swimmers and can hold their breath underwater for up to 40 minutes.' german='Faultiere sind faszinierende Kreaturen, die für ihre langsame Bewegung und einzigartige Lebensweise bekannt sind. Sie verbringen den Großteil ihres Lebens kopfüber hängend in den Bäumen tropischer Regenwälder. Trotz ihres trägen Erscheinungsbildes sind Faultiere ausgezeichnete Schwimmer und können ihren Atem unter Wasser bis zu 40 Minuten lang anhalten.'\n", + "Critique: The German translation is of outstanding quality, deserving the full 5 points. It accurately conveys the meaning of the English text, maintaining both the informative tone and the structure of the original. The translator has chosen appropriate terminology (e.g., \"Faultiere\" for \"sloths\") and has accurately translated specific details (e.g., \"40 minutes\"). The German text reads naturally and fluently, as if it were originally written in German. 
It captures the fascination with sloths expressed in the English version and maintains the same level of detail and engagement. This translation demonstrates a mastery of both languages and would be suitable for professional use without any modifications.\n", + "Score: 5\n", + "english='Cats are fascinating creatures that have been domesticated for thousands of years. They are known for their independent nature, playful behavior, and ability to form strong bonds with their human companions. Many people appreciate cats for their low-maintenance lifestyle and soothing purrs.' german='Katzen sind faszinierende Geschöpfe, die seit Jahrtausenden domestiziert wurden. Sie sind bekannt für ihre unabhängige Natur, ihr verspieltes Verhalten und ihre Fähigkeit, starke Bindungen zu ihren menschlichen Begleitern aufzubauen. Viele Menschen schätzen Katzen wegen ihres pflegeleichten Lebensstils und ihres beruhigenden Schnurrens.'\n", + "Critique: This translation is outstanding and deserves the full 5 points. It accurately conveys the meaning of the original text, maintaining its tone and style. The German version reads naturally and fluently, as if originally written in German. Key concepts like \"domesticated,\" \"independent nature,\" and \"form strong bonds\" are translated precisely. The translator skillfully handles phrases like \"low-maintenance lifestyle\" (pflegeleichten Lebensstils) and \"soothing purrs\" (beruhigenden Schnurrens), demonstrating mastery of both languages and capturing subtle nuances. The result is a professional-quality translation that perfectly conveys the original message.\n", + "Score: 5\n", + "english=\"Dogs are often referred to as man's best friend, known for their loyalty, companionship, and diverse breeds ranging from tiny Chihuahuas to large Great Danes.\" german='Hunde werden oft als des Menschen bester Freund bezeichnet und sind bekannt für ihre Treue, Kameradschaft und vielfältige Rassen, die von winzigen Chihuahuas bis hin zu großen Doggen reichen.'\n", + "Critique: The German translation accurately conveys the full meaning of the English source text, demonstrating a high level of accuracy and natural flow in the target language. It maintains the tone and intent of the original, effectively translating key concepts such as \"man's best friend,\" \"loyalty,\" and \"companionship.\" The translation also accurately renders the range of dog breeds, from Chihuahuas to Great Danes (Doggen in German). The sentence structure is well-adapted to German, reading fluently and naturally. The translator has made excellent choices in vocabulary and phrasing, resulting in a translation that could easily be mistaken for an original German text. 
This translation meets all criteria for a top score, showcasing mastery of both languages and capturing all nuances of the original.\n", + "Score: 5\n" + ] + } + ], + "source": [ + "for translation in translations:\n", + " critique: TranslationCritique = client.chat.completions.create(\n", + " model=\"claude-3-5-sonnet-20240620\",\n", + " max_tokens=512,\n", + " max_retries=0,\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": prompt.format(translation=str(translation)),\n", + " }\n", + " ],\n", + " response_model=TranslationCritique,\n", + " )\n", + " print(translation)\n", + " print(\"Critique:\", critique.critique)\n", + " print(\"Score:\", critique.score)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As you can see, the model is able to generate a critique and score for each translation. Also, it seems quite fond of its own translations. Let's see what happens when we give it a terrible translation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "english='The city council meeting on climate change initiatives was contentious, with passionate arguments from both sides. Ultimately, the proposal for increased funding for renewable energy projects was approved by a narrow margin.' german='Die Stadt Rat Treffen auf Klima Änderung Initiativen war streitsüchtig, mit passioniert Argumente von beide Seiten. Ultimativ, die Proposal für gesteigert Geld für erneuerbar Energie Projekten war approved bei ein eng Margin.'\n", + "Critique: The translation conveys the basic meaning of the source text, but it has significant issues. While it communicates the general idea of a contentious city council meeting about climate change initiatives, the German translation is riddled with errors and awkward phrasing. It uses many English words directly (e.g., \"approved,\" \"ultimativ\") instead of their German equivalents. The sentence structure is unnatural, following English syntax too closely. Terminology is inconsistent and often incorrect (e.g., \"Stadt Rat\" instead of \"Stadtrat\"). The translation lacks fluency and precision, making it unsuitable for professional use without substantial revision. It appears to be the work of an inexperienced translator or possibly a rudimentary machine translation.\n", + "Score: 1\n" + ] + } + ], + "source": [ + "bad_translation = EnglishToGerman(\n", + " english=\"The city council meeting on climate change initiatives was contentious, with passionate arguments from both sides. Ultimately, the proposal for increased funding for renewable energy projects was approved by a narrow margin.\",\n", + " german=\"Die Stadt Rat Treffen auf Klima Änderung Initiativen war streitsüchtig, mit passioniert Argumente von beide Seiten. 
Ultimativ, die Proposal für gesteigert Geld für erneuerbar Energie Projekten war approved bei ein eng Margin.\"\n", + ")\n", + "\n", + "critique: TranslationCritique = client.chat.completions.create(\n", + "    model=\"claude-3-5-sonnet-20240620\",\n", + "    max_tokens=512,\n", + "    max_retries=0,\n", + "    messages=[\n", + "        {\n", + "            \"role\": \"user\",\n", + "            \"content\": prompt.format(translation=str(bad_translation)),\n", + "        }\n", + "    ],\n", + "    response_model=TranslationCritique,\n", + ")\n", + "print(bad_translation)\n", + "print(\"Critique:\", critique.critique)\n", + "print(\"Score:\", critique.score)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Wunderbar! Now we have a relatively sophisticated way of generating tons of high-quality data. Let's put our newfound knowledge to the test. For this, let us train a coding language model (something close to my heart) on a bunch of synthetic programs. To inject diversity, we will be using a list of personas for the model to generate the programs for. Specifically, we will be using the PersonaHub dataset from the paper [Scaling Synthetic Data Creation with 1,000,000,000 Personas](https://arxiv.org/abs/2406.20094v1), which contains a subset of roughly 200k personas. Below are some of the bits that we will be using to generate the programs. However, we will be applying multiprocessing to speed up the generation process, which makes the code a bit unwieldy. To see the full code, please see the repository [tiny_programs](https://github.com/AnswerDotAI/tiny_programs)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class TinyProgram(BaseModel):\n", + "    requirements: str = Field(description=\"A description of the requirements for the program to help the persona.\")\n", + "    code: str = Field(description=\"The code that satisfies the requirements. Ensure it is well written and documented.\")\n", + "\n", + "TinyProgram(\n", + "    requirements=\"A Python-based data aggregation and analysis tool that scrapes key Salvadoran news websites and government portals for the latest political updates, election results, and policy changes. The program would use standard libraries like requests for web scraping, re for text parsing, and pandas for data manipulation. It would store the collected information in a structured format, perform basic sentiment analysis on news articles, and generate a daily summary report highlighting significant political events, trending topics, and shifts in public opinion. 
The tool could also track mentions of key political figures and parties, providing a quick overview of their media presence and associated sentiments.\",\n", + " code=\"\"\"\\\n", + "```python\n", + "import requests\n", + "from bs4 import BeautifulSoup\n", + "import pandas as pd\n", + "from textblob import TextBlob\n", + "from collections import Counter\n", + "import datetime\n", + "\n", + "def scrape_news(url):\n", + " response = requests.get(url)\n", + " soup = BeautifulSoup(response.content, 'html.parser')\n", + " articles = soup.find_all('article', class_='article-item')\n", + " \n", + " news_data = []\n", + " for article in articles:\n", + " title = article.find('h2', class_='article-title').text.strip()\n", + " summary = article.find('p', class_='article-summary').text.strip()\n", + " news_data.append({'title': title, 'summary': summary})\n", + " \n", + " return news_data\n", + "\n", + "def analyze_sentiment(text):\n", + " return TextBlob(text).sentiment.polarity\n", + "\n", + "def generate_report(data):\n", + " df = pd.DataFrame(data)\n", + " df['sentiment'] = df['summary'].apply(analyze_sentiment)\n", + " \n", + " # Calculate average sentiment\n", + " avg_sentiment = df['sentiment'].mean()\n", + " \n", + " # Find most mentioned words\n", + " all_words = ' '.join(df['title'] + ' ' + df['summary']).lower().split()\n", + " word_freq = Counter(word for word in all_words if len(word) > 3)\n", + " top_words = word_freq.most_common(5)\n", + " \n", + " # Generate report\n", + " report = f\"Daily Political Analysis Report for El Salvador - {datetime.date.today()}\\n\\n\"\n", + " report += f\"Number of articles analyzed: {len(df)}\\n\"\n", + " report += f\"Average sentiment: {'Positive' if avg_sentiment > 0 else 'Negative'} ({avg_sentiment:.2f})\\n\\n\"\n", + " report += \"Top mentioned words:\\n\"\n", + " for word, count in top_words:\n", + " report += f\"- {word}: {count} times\\n\"\n", + " \n", + " report += \"\\nMost positive article:\\n\"\n", + " pos_article = df.loc[df['sentiment'].idxmax()]\n", + " report += f\"Title: {pos_article['title']}\\nSentiment: {pos_article['sentiment']:.2f}\\n\\n\"\n", + " \n", + " report += \"Most negative article:\\n\"\n", + " neg_article = df.loc[df['sentiment'].idxmin()]\n", + " report += f\"Title: {neg_article['title']}\\nSentiment: {neg_article['sentiment']:.2f}\\n\"\n", + " \n", + " return report\n", + "\n", + "def main():\n", + " url = \"https://www.elsalvador.com/noticias/nacional/\" # Example Salvadoran news website\n", + " news_data = scrape_news(url)\n", + " report = generate_report(news_data)\n", + " print(report)\n", + " \n", + " # Optionally, save the report to a file\n", + " with open(f\"el_salvador_political_report_{datetime.date.today()}.txt\", \"w\") as f:\n", + " f.write(report)\n", + "\n", + "if __name__ == \"__main__\":\n", + " main()\n", + "```\n", + "\"\"\"\n", + ")\n", + "\n", + "prompt_template = \"\"\"\\\n", + "Here are some examples:\n", + "{examples}\n", + "\n", + "Create requirements and the python program that satisfies them for the following persona: {persona}\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To evaluate the quality of the code, we will be using the following prompt:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "prompt_template = \"\"\"\\\n", + "Below is a code snippet. 
Evaluate its educational value for teaching programming to beginners in this language, using the additive 5-point scoring system described below. Points are accumulated based on the satisfaction of each criterion:\n", + "\n", + "- Add 1 point if the code is syntactically correct and runs without errors, providing a basic example of working code in the language.\n", + "- Add another point if the code demonstrates fundamental programming concepts (e.g., variables, control structures, functions) in a straightforward manner, even if it's not optimized or doesn't follow all best practices.\n", + "- Award a third point if the code is well-commented, explaining key concepts and the purpose of different code sections. It should be readable and illustrate good naming conventions, making it easier for beginners to understand.\n", + "- Grant a fourth point if the code showcases language-specific features or common programming patterns in an accessible way. It should provide clear examples of how to apply these concepts practically.\n", + "- Bestow a fifth point if the code is an exemplary teaching tool, striking an excellent balance between simplicity and real-world applicability. It should inspire further learning, possibly including deliberate mistakes or opportunities for improvement that a teacher could use as discussion points.\n", + "\n", + "The code snippet:\n", + "```python\n", + "{code}\n", + "```\n", + "\n", + "After examining the code:\n", + "\n", + "- Briefly justify your total score, up to 100 words, focusing on its effectiveness as a teaching tool for beginners.\n", + "- Conclude with the score.\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "I've gone ahead and run the above for 1,000 programs, which resulted in 992 programs that were properly generated and scored. Here is the distribution of the scores:\n", + "\n", + "| Score | Count |\n", + "|-------|-------|\n", + "| 1 | 25 |\n", + "| 2 | 117 |\n", + "| 3 | 96 |\n", + "| 4 | 256 |\n", + "| 5 | 498 |\n", + "\n", + "I went ahead and got the quality scores for 10,000 random programs from GitHub, and they are distributed as follows:\n", + "\n", + "| Score | Count |\n", + "|-------|-------|\n", + "| 1 | 2239 |\n", + "| 2 | 5230 |\n", + "| 3 | 1545 |\n", + "| 4 | 618 |\n", + "| 5 | 236 |\n", + "\n", + "Now, let's train a model on these programs. Specifically, I want to compare the performance of a model trained on these programs to a model trained on a comparable dataset from the wild, aka GitHub. We will be using the SmolLM-360M model from Hugging Face as the baseline, and we will test out the following configurations, all of which were trained for 5 epochs on roughly 1,000 programs:\n", + "0. Baseline model\n", + "1. Train on the 992 synthetic programs\n", + "2. Train on 992 random GitHub programs\n", + "3. Train on a mixture of 496 synthetic programs scored 4 or 5 and 496 random GitHub programs\n", + "4. Train on the 754 synthetic programs scored 4 or 5\n", + "5. Train on 754 GitHub programs scored 4 or 5, to match the number of synthetic programs\n", + "\n", + "To evaluate the performance of these models, we will be using the standard HumanEval benchmark, which is a collection of 164 programming questions designed to test the ability of a coding LLM to generate correct and efficient code. 
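For concreteness, here is a minimal sketch of what computing pass@1 on HumanEval can look like. It assumes OpenAI's `human-eval` package and the `HuggingFaceTB/SmolLM-360M` checkpoint; treat it as an illustration rather than the exact harness used for the numbers below.\n", + "\n", + "```python\n", + "# Rough pass@1 sketch (assumed package and checkpoint names; not the exact code used here).\n", + "from human_eval.data import read_problems, write_jsonl\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer\n", + "\n", + "tok = AutoTokenizer.from_pretrained(\"HuggingFaceTB/SmolLM-360M\")\n", + "model = AutoModelForCausalLM.from_pretrained(\"HuggingFaceTB/SmolLM-360M\")\n", + "\n", + "def complete(prompt: str) -> str:\n", + "    # Greedy decoding keeps the sketch deterministic; real runs typically sample.\n", + "    ids = tok(prompt, return_tensors=\"pt\")\n", + "    out = model.generate(**ids, max_new_tokens=256)\n", + "    return tok.decode(out[0][ids[\"input_ids\"].shape[1]:], skip_special_tokens=True)\n", + "\n", + "# One completion per problem, written in the format the benchmark's scorer expects.\n", + "samples = [dict(task_id=tid, completion=complete(p[\"prompt\"])) for tid, p in read_problems().items()]\n", + "write_jsonl(\"samples.jsonl\", samples)\n", + "# Score with the package's CLI: evaluate_functional_correctness samples.jsonl\n", + "```\n", + "\n", + "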
Here are the results!\n", + "\n", + "| Setup | pass@1 | raw |\n", + "|---------|--------|------|\n", + "| Setup 0 | 11.6% | 0.11585365853658537 |\n", + "| Setup 1 | 9.1% | 0.09146341463414634 |\n", + "| Setup 2 | 11.0% | 0.10975609756097561 |\n", + "| Setup 3 | 9.8% | 0.0975609756097561 |\n", + "| Setup 4 | 12.2% | 0.12195121951219512 |\n", + "| Setup 5 | 8.5% | 0.08536585365853659 |\n", + "\n", + "### Key findings from the experiment\n", + "\n", + "We find some interesting results from these experiments! Once quality filtering is applied, training on synthetic data is clearly better than training on GitHub programs (Setup 4 vs. Setup 5), though without filtering the synthetic programs do not show the same advantage (Setup 1 vs. Setup 2). Also of note is that we are only able to improve over the baseline, and only by a small margin, when using high-quality synthetic data as in Setup 4. All other setups degrade performance, especially Setup 5, which trains on only high-quality GitHub programs; this is a bit surprising, as much research has gone into showing that high-quality data is better for training. More investigation will be needed to see why this is the case, but one possibility is that the scoring system is not as well suited to these GitHub programs as to the synthetic programs, or it could be due to a lack of diversity in the GitHub programs.\n", + "\n", + "Some homework I'd like you to do: try to replicate the experiment on your own task, and experiment with scaling up the size of the dataset to see how it impacts the performance of the model trained on it. As always, please share your findings with the community and feel free to reach out for help!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Takeaways\n", + "\n", + "Alright, so what are the key takeaways I want you to leave with? The first is that both quality and diversity are very important aspects of synthetic data and can make or break models trained on it. The second is that, imo, quality is by far harder to get right than diversity, due to its multi-dimensional nature, especially for free-form content. And lastly, I'd like you to take away that synthetic data is a great tool to reach for when you don't have a lot of data for your task. It's cheap and fast to create, and when done correctly it can boost performance on your task.\n", + "\n", + "You can find all the code for this post in our minimal synthetic data repo [fastdata](https://github.com/AnswerDotAI/fastdata)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Some ramblings and interesting resources in this area\n", + "\n", + "The first paper that I remember reading on this topic was [Evolution through Large Models](https://arxiv.org/abs/2206.08896) by Lehman et al. The problem they faced was generating a walker robot in the Sodarace domain that can move across a given terrain. These walker robots are defined in Python using a framework that was not seen by the model the authors used. To teach the model how to generate these walker-robot Python programs, they took a synthetic data approach. Since LLMs need a ton of data to learn, they first use a coding LLM to mutate existing programs in a genetic-programming style, producing augmented variants of them. They then finetune the LLM on these programs and use it to synthesize even more programs. 
They show that this process can be repeated to improve the model's ability to generate walker robots.\n", + "\n", + "```python\n", + "from walk_creator import walker_creator\n", + "...\n", + "def make_walker():\n", + "    wc = walker_creator()\n", + "    # the main body is a square\n", + "    sides = make_square(wc, 0, 0, 10, 10)\n", + "    center = wc.add_joint(5, 5)\n", + "    ...\n", + "```\n", + "\n", + "Another interesting paper from some of my colleagues is [Quality-Diversity through AI Feedback](https://arxiv.org/abs/2310.13032) by Bradley et al. This was where I really started to understand the connection among quality-diversity, artificial life, and synthetic data for LLMs. Until this paper, I didn't really know about the work on QD algorithms, which I recently learned was pioneered by [Joel Lehman and Kenneth O. Stanley](https://quality-diversity.github.io/) (of course, I should have known smh). Compared to standard genetic-programming approaches, which are purely an optimization problem attempting to find a single most-fit solution, QD attempts to find a diverse set of solutions, each with a high level of quality/fitness. Okay, back to the paper: it is particularly interesting because it shows you can apply QD to tasks whose fitness is not easily measured, such as creative writing, by leveraging feedback from LLMs.\n", + "\n", + "Other recommended resources:\n", + "* [The Curious Decline of Linguistic Diversity: Training Language Models on Synthetic Text](https://arxiv.org/abs/2311.09807)\n", + "* [How Bad is Training on Synthetic Data? A Statistical Analysis of Language Model Collapse](https://arxiv.org/abs/2406.17557)\n", + "* [The Curse of Recursion: Training on Generated Data Makes Models Forget](https://arxiv.org/abs/2305.17493)\n", + "* [Best Practices and Lessons Learned on Synthetic Data for Language Models](https://arxiv.org/abs/2404.07503)\n", + "* [WizardLM: Empowering Large Language Models to Follow Complex Instructions](https://arxiv.org/abs/2304.12244)\n", + "* [🍷 FineWeb: decanting the web for the finest text data at scale](https://huggingface.co/spaces/HuggingFaceFW/blogpost-fineweb-v1)\n", + "* [Textbooks Are All You Need](https://arxiv.org/abs/2306.11644)\n", + "* [Quality-Diversity through AI Feedback](https://arxiv.org/abs/2310.13032)\n", + "* [Nemotron-4 340B Technical Report](https://arxiv.org/abs/2406.11704)\n", + "* [Scaling Synthetic Data Creation with 1,000,000,000 Personas](https://arxiv.org/abs/2406.20094v1)\n", + "* [Quality-Diversity optimisation algorithms](https://quality-diversity.github.io/)\n", + "* [ICML 2019 Tutorial: Recent Advances in Population-Based Search for Deep Neural Networks](https://youtu.be/g6HiuEnbwJE?si=kdnEOFsrvwAyqei9)\n", + "* [How to Train Data-Efficient LLMs](https://arxiv.org/abs/2402.09668)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/nbs/index.ipynb b/nbs/index.ipynb index 43b79a0..a85556f 100644 --- a/nbs/index.ipynb +++ b/nbs/index.ipynb @@ -1,29 +1,14 @@ { "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#| hide\n", - "from fastdata.core import *" - ] - }, { "cell_type": "markdown", "metadata": {}, "source": [ "# fastdata\n", "\n", - "> Easiest and fastest way to 1B synthetic tokens" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This file will become your README and also the index 
of your documentation." + "> Easiest and fastest way to 1B synthetic tokens\n", + "\n", + "Minimalist library that wraps around `instructor` to make generating synthetic data easy." ] }, { @@ -137,7 +122,52 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Fill me in please! Don't forget code examples:" + "First you need to define the structure of the data you want to generate. `instructor`, which is the library that fastdata uses to generate data, requires you to define the schema of the data you want to generate. This is done using pydantic models." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pydantic import BaseModel, Field\n", + "\n", + "class Translation(BaseModel):\n", + " english: str = Field(description=\"An english phrase\")\n", + " german: str = Field(description=\"An equivalent german phrase that is a translation of the english phrase\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, you need to define the prompt that will be used to generate the data and any inputs you want to pass to the prompt." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "prompt_template = \"\"\"\\\n", + "Generate English and German translations on the following topic:\n", + "{topic}\n", + "\"\"\"\n", + "\n", + "inputs = [{\"topic\": \"Otters are cute\"}, {\"topic\": \"I love programming\"}]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we can generate some data with fastdata.\n", + "\n", + "::: {.callout-note}\n", + "We only support Anthropic models at the moment. Therefore, make sure you have an API key for the model you want to use and the proper environment variables set or pass the api key to the `FastData` class `FastData(api_key=\"sk-ant-api03-...\")`.\n", + ":::" ] }, { @@ -146,45 +176,63 @@ "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "2" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 2/2 [00:00<00:00, 2.21it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Translations:\n", + "[ {'english': 'Otters are cute', 'german': 'Otter sind süß'},\n", + " {'english': 'I love programming', 'german': 'Ich liebe das Programmieren'}]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] } ], "source": [ - "1+1" + "from fastdata.core import FastData\n", + "\n", + "import pprint\n", + "\n", + "# Create a pretty printer object with custom settings\n", + "pp = pprint.PrettyPrinter(indent=4, width=100, compact=False)\n", + "\n", + "fast_data = FastData()\n", + "translations = fast_data.generate(\n", + " prompt_template=prompt_template,\n", + " inputs=inputs,\n", + " response_model=Translation,\n", + " model=\"claude-3-haiku-20240307\"\n", + ")\n", + "\n", + "# Pretty print the translations\n", + "print(\"Translations:\")\n", + "pp.pprint(translations)" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], - "source": [] + "source": [ + "If you'd like to see how best to generate data with fastdata, check out our blog post [here](https://www.answer.ai/blog/introducing-fastdata) and some of the examples in the [examples](https://github.com/AnswerDotAI/fastdata/tree/main/examples) directory." 
+ ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "python3", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.8" } }, "nbformat": 4,