diff --git a/claudette/helpers.py b/claudette/helpers.py index 80c0f65..35ed879 100644 --- a/claudette/helpers.py +++ b/claudette/helpers.py @@ -65,7 +65,7 @@ def build_xml(data, parent): # %% ../helpers.ipynb 23 doctype = namedtuple('doctype', ['source', 'content']) -# %% ../helpers.ipynb 26 +# %% ../helpers.ipynb 27 def mk_doctype(content:str, # The document content source:Optional[str]=None # URL, filename, etc; defaults to `md5(content)` if not provided ) -> namedtuple: diff --git a/helpers.ipynb b/helpers.ipynb index b07d468..5eb2c8e 100644 --- a/helpers.ipynb +++ b/helpers.ipynb @@ -348,7 +348,7 @@ }, { "cell_type": "markdown", - "id": "c0ba3e57", + "id": "7788c48c", "metadata": {}, "source": [ "## Including documents" @@ -356,7 +356,7 @@ }, { "cell_type": "markdown", - "id": "c696deff", + "id": "479be4c9", "metadata": {}, "source": [ "According [to Anthropic](https://docs.anthropic.com/claude/docs/long-context-window-tips), \"*it's essential to structure your prompts in a way that clearly separates the input data from the instructions*\". They recommend using the following format:\n", @@ -374,13 +374,15 @@ "\n", "\n", "\n", - "```" + "```\n", + "\n", + "We will create some small helper functions to make it easier to generate context in this format." ] }, { "cell_type": "code", "execution_count": null, - "id": "9e82a041", + "id": "a01dc320", "metadata": {}, "outputs": [], "source": [ @@ -390,7 +392,7 @@ }, { "cell_type": "markdown", - "id": "1e3749e9", + "id": "6620a123", "metadata": {}, "source": [ "We'll use `doctype` to store our pairs." @@ -399,7 +401,7 @@ { "cell_type": "code", "execution_count": null, - "id": "37562c12", + "id": "ce853491", "metadata": {}, "outputs": [], "source": [ @@ -410,10 +412,18 @@ " return s" ] }, + { + "cell_type": "markdown", + "id": "026d3b06", + "metadata": {}, + "source": [ + "Since Anthropic's example shows newlines before and after each tag, we'll do the same." + ] + }, { "cell_type": "code", "execution_count": null, - "id": "527606d0", + "id": "932e8858", "metadata": {}, "outputs": [], "source": [ @@ -426,10 +436,18 @@ " return doctype(add_nls(str(source).strip()), add_nls(content.strip()))" ] }, + { + "cell_type": "markdown", + "id": "8800921b", + "metadata": {}, + "source": [ + "This is a convenience wrapper to ensure that a `doctype` has the needed information in the right format." + ] + }, { "cell_type": "code", "execution_count": null, - "id": "e615b30e", + "id": "14f9e185", "metadata": {}, "outputs": [ { @@ -448,29 +466,10 @@ "mk_doctype(doc)" ] }, - { - "cell_type": "markdown", - "id": "8ae09c6d", - "metadata": {}, - "source": [ - "We'll use this sample:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c505a8f8", - "metadata": {}, - "outputs": [], - "source": [ - "docs = [doc, 'And another one']\n", - "sources = [None, 'doc.txt']" - ] - }, { "cell_type": "code", "execution_count": null, - "id": "34ec9a5a", + "id": "3b8e6f87", "metadata": {}, "outputs": [], "source": [ @@ -485,10 +484,18 @@ " return xt('document', [source, content], index=index)" ] }, + { + "cell_type": "markdown", + "id": "a8b6ac26", + "metadata": {}, + "source": [ + "We can now generate XML for one document in the suggested format:" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "fb34fe00", + "id": "e7ed5a9a", "metadata": {}, "outputs": [ { @@ -513,7 +520,7 @@ { "cell_type": "code", "execution_count": null, - "id": "601bc8df", + "id": "ba5ebfab", "metadata": {}, "outputs": [], "source": [ @@ -524,14 +531,22 @@ " \"Create an XML string containing `docs` in Anthropic's recommended format\"\n", " pre = 'Here are some documents for you to reference for your task:\\n\\n' if prefix else ''\n", " if sources is None: sources = [None]*len(docs)\n", - " docs = [mk_doc(i, *o) for i,o in enumerate(zip(docs,sources))]\n", + " docs = [mk_doc(i+1, *o) for i,o in enumerate(zip(docs,sources))]\n", " return pre + to_xml(xt('documents', docs))" ] }, + { + "cell_type": "markdown", + "id": "85004124", + "metadata": {}, + "source": [ + "Putting it all together, we have our final XML format:" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "c89cdaaa", + "id": "1dac60f6", "metadata": {}, "outputs": [ { @@ -541,7 +556,7 @@ "Here are some documents for you to reference for your task:\n", "\n", "\n", - "\n", + "\n", "\n", "b8898fab\n", "\n", @@ -549,7 +564,7 @@ "This is a sample\n", "\n", "\n", - "\n", + "\n", "\n", "doc.txt\n", "\n", @@ -562,12 +577,14 @@ } ], "source": [ + "docs = [doc, 'And another one']\n", + "sources = [None, 'doc.txt']\n", "print(docs_xml(docs, sources))" ] }, { "cell_type": "markdown", - "id": "ed712bc7", + "id": "2a8a7a9a", "metadata": {}, "source": [ "## Context creation" @@ -575,16 +592,32 @@ }, { "cell_type": "markdown", - "id": "96b8aebc", + "id": "cd06b2dc", + "metadata": {}, + "source": [ + "Now that we can generate Anthropic's XML format, let's make it easy for a few common cases." + ] + }, + { + "cell_type": "markdown", + "id": "65317fc6", "metadata": {}, "source": [ "### File list to context" ] }, + { + "cell_type": "markdown", + "id": "3778e8ed", + "metadata": {}, + "source": [ + "For generating XML context from files, we'll just read them as text and use the file names as `source`." + ] + }, { "cell_type": "code", "execution_count": null, - "id": "cde670d7", + "id": "0a168636", "metadata": {}, "outputs": [], "source": [ @@ -600,70 +633,115 @@ { "cell_type": "code", "execution_count": null, - "id": "553d6a35", + "id": "1bf73d36", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Here are some documents for you to reference for your task:\n", - "\n", - "\n", - "\n", - "\n", - "samples/sample_core.py\n", - "\n", - "\n", - "__all__ = ['empty', 'models']\n", - "\n", - "import inspect\n", - "\n", - "empty = inspect.Parameter.empty\n", - "models = 'claude-3-opus-20240229','claude-3-sonnet-20240229','claude-3-haiku-20240307'\n", - "\n", - "\n", - "\n", - "\n", - "samples/sample_styles.css\n", - "\n", - "\n", - ".cell { margin-bottom: 1rem; }\n", - ".cell > .sourceCode { margin-bottom: 0; }\n", - ".cell-output > pre { margin-bottom: 0; }\n", - "\n", - "\n", - "\n" - ] + "data": { + "text/markdown": [ + "```xml\n", + "Here are some documents for you to reference for your task:\n", + "\n", + "\n", + "\n", + "\n", + "samples/sample_core.py\n", + "\n", + "\n", + "import inspect\n", + "empty = inspect.Parameter.empty\n", + "models = 'claude-3-opus-20240229','claude-3-sonnet-20240229','claude-3-haiku-20240307'\n", + "\n", + "\n", + "\n", + "\n", + "samples/sample_styles.css\n", + "\n", + "\n", + ".cell { margin-bottom: 1rem; }\n", + ".cell > .sourceCode { margin-bottom: 0; }\n", + ".cell-output > pre { margin-bottom: 0; }\n", + "\n", + "\n", + "\n", + "```" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "fnames = ['samples/sample_core.py', 'samples/sample_styles.css']\n", - "print(files2ctx(fnames))" + "hl_md(files2ctx(fnames))" ] }, { "cell_type": "markdown", - "id": "a9073312", + "id": "191ddb2b", "metadata": {}, "source": [ "### Folder to context" ] }, { - "cell_type": "markdown", - "id": "546d4e3a", + "cell_type": "code", + "execution_count": null, + "id": "f4c5cb5a", + "metadata": {}, + "outputs": [], + "source": [ + "from fastcore.meta import delegates" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0452a21", "metadata": {}, + "outputs": [], "source": [ - "- Optional exclusion pattern" + "@delegates(globtastic)\n", + "def folder2ctx(\n", + " folder:Union[str,Path], # Folder name containing files to add to context\n", + " prefix:bool=True, # Include Anthropic's suggested prose intro?\n", + " **kwargs # Passed to `globtastic`\n", + ")->str: # XML for Claude context\n", + " fnames = globtastic(folder, **kwargs)\n", + " return files2ctx(fnames, prefix=prefix)" ] }, { - "cell_type": "markdown", - "id": "a22fe199", + "cell_type": "code", + "execution_count": null, + "id": "efd52392", "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + "samples/sample_core.py\n", + "\n", + "\n", + "import inspect\n", + "empty = inspect.Parameter.empty\n", + "models = 'claude-3-opus-20240229','claude-3-sonnet-20240229','claude-3-haiku-20240307'\n", + "\n", + "\n", + "\n" + ] + } + ], "source": [ - "### GitHub to context" + "print(folder2ctx('samples', prefix=False, file_glob='*.py'))" ] }, { diff --git a/samples/sample_core.py b/samples/sample_core.py index 44afa98..7f54af1 100644 --- a/samples/sample_core.py +++ b/samples/sample_core.py @@ -1,7 +1,3 @@ -__all__ = ['empty', 'models'] - import inspect - empty = inspect.Parameter.empty models = 'claude-3-opus-20240229','claude-3-sonnet-20240229','claude-3-haiku-20240307' -