Skip to content

Commit

Permalink
feat: Setup function (#42)
Browse files Browse the repository at this point in the history
* Added setup configuration

* Added the setup function
  • Loading branch information
MrtinoRG authored Jun 27, 2024
1 parent cac595d commit d335206
Show file tree
Hide file tree
Showing 12 changed files with 46 additions and 142 deletions.
43 changes: 3 additions & 40 deletions content/agents/agent.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -14,21 +14,6 @@
"This notebook aims to demonstrate how to construct LLM agents and explain their functioning. In this practical application, we will focus on extracting chemical reactions from an image.\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"tags": [
"remove-cell"
]
},
"outputs": [],
"source": [
"import warnings\n",
"\n",
"warnings.filterwarnings(\"ignore\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
Expand All @@ -54,7 +39,9 @@
"from langchain.agents.react.agent import create_react_agent\n",
"from langchain_openai import ChatOpenAI\n",
"\n",
"from litellm import completion"
"from litellm import completion\n",
"\n",
"import llmstructdata"
]
},
{
Expand All @@ -66,30 +53,6 @@
"```"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"tags": [
"remove-output"
]
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"load_dotenv(\".env\", override=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down
6 changes: 3 additions & 3 deletions content/beyond_text/beyond_images.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@
},
"outputs": [],
"source": [
"import llmstructdata\n",
"\n",
"from pdf2image import convert_from_path\n",
"\n",
"file_path = \"../obtaining_data/PDFs/10.26434_chemrxiv-2024-1l0sn.pdf\"\n",
Expand Down Expand Up @@ -355,7 +357,7 @@
}
],
"source": [
"from dotenv import load_dotenv\n",
"import os\n",
"from litellm import completion\n",
"\n",
"\n",
Expand Down Expand Up @@ -398,8 +400,6 @@
"\n",
"\n",
"# Load the OpenAI API key from environment variables\n",
"dotenv_path = \"../.env\"\n",
"load_dotenv(dotenv_path)\n",
"api_key = os.getenv(\"OPENAI_API_KEY\")\n",
"\n",
"# Set the API key for LiteLLM\n",
Expand Down
3 changes: 1 addition & 2 deletions content/constrained_decoding/index.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,8 @@
"import instructor\n",
"from IPython.display import SVG\n",
"from openai import OpenAI\n",
"from dotenv import load_dotenv\n",
"\n",
"_ = load_dotenv(\"../.env\", override=True)"
"import llmstructdata"
]
},
{
Expand Down
2 changes: 2 additions & 0 deletions content/context_window/Dealing_with_context_window.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,8 @@
},
"outputs": [],
"source": [
"import llmstructdata\n",
"\n",
"from sentence_transformers import SentenceTransformer\n",
"\n",
"model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n",
Expand Down
2 changes: 2 additions & 0 deletions content/document_parsing_and_cleaning/cleaning.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@
}
],
"source": [
"import llmstructdata\n",
"\n",
"import re\n",
"\n",
"\n",
Expand Down
2 changes: 1 addition & 1 deletion content/document_parsing_and_cleaning/parsing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
"outputs": [],
"source": [
"import os\n",
"\n",
"import llmstructdata\n",
"\n",
"def convert_pdf_with_nougat(\n",
" pdf_path, output_dir, model=\"0.1.0-small\", batch_size=1, no_skipping=False\n",
Expand Down
61 changes: 3 additions & 58 deletions content/finetune/choosing_paradigm.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -50,22 +50,6 @@
"We will start by importing all the packages needed."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "544ff2bd",
"metadata": {
"tags": [
"remove-cell"
]
},
"outputs": [],
"source": [
"import warnings\n",
"\n",
"warnings.filterwarnings(\"ignore\")"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down Expand Up @@ -101,8 +85,9 @@
"from litellm import completion\n",
"from litellm.caching import Cache\n",
"from statistics import mean\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt"
"import numpy as np \n",
"import matplotlib.pyplot as plt \n",
"import llmstructdata"
]
},
{
Expand All @@ -117,32 +102,6 @@
"```"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "94595c41",
"metadata": {
"tags": [
"remove-output"
]
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"litellm.cache = Cache()\n",
"load_dotenv(\".env\", override=True)"
]
},
{
"cell_type": "markdown",
"id": "a7e82b75",
Expand Down Expand Up @@ -1926,20 +1885,6 @@
"To study the results more graphically, we can plot all the results in several bar plots."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "53cf6669",
"metadata": {
"tags": [
"remove-cell"
]
},
"outputs": [],
"source": [
"plt.style.use(\"../package/llmstructdata/book.mplstyle\")"
]
},
{
"cell_type": "code",
"execution_count": 26,
Expand Down
1 change: 1 addition & 0 deletions content/obtaining_data/crossref_search.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
"outputs": [],
"source": [
"from crossref.restful import Works\n",
"import llmstructdata\n",
"import json\n",
"\n",
"works = Works(timeout=60)\n",
Expand Down
1 change: 1 addition & 0 deletions content/obtaining_data/data_mining.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
"outputs": [],
"source": [
"from paperscraper.get_dumps import chemrxiv\n",
"import llmstructdata\n",
"\n",
"# Download of the ChemRxiv paper dump\n",
"chemrxiv(save_path=\"chemrxiv_2020-11-10.jsonl\")"
Expand Down
40 changes: 9 additions & 31 deletions content/perovskite/constrained_formulas.ipynb
Original file line number Diff line number Diff line change
@@ -1,20 +1,5 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"tags": [
"remove-cell"
]
},
"outputs": [],
"source": [
"import warnings\n",
"\n",
"warnings.filterwarnings(\"ignore\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -26,15 +11,15 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"```{admonition} Motivation \n",
":class:note\n",
"````{admonition} Motivation \n",
":class: note\n",
"\n",
"In this notebook we want to collect a dataset on structured data that will allow us to train a composition-property prediction model later on. But we want to focus on a specific class of materials called **chalcogenide perovskites**.\n",
"\n",
"- We want to ensure that the output formulas retrieved can be parsed into a `pymatgen` `Composition` object. This is important to be able to reuse the extracted data with popular materials informatics packages, such as those leading in [MatBench](https://matbench.materialsproject.org/).\n",
"- We want to ensure control over the stoichiometry of the compounds retrieved. We also want to ensure that the formulas fulfill the criteria of charge neutrality. For this, we will employ a constrained decoding approach as exemplified in the [constrained decoding]() notebook.\n",
"- We will validate if the formulas retrieved fulfill some stability criteria for being a perovskite compound. For this we will employ a similar approach as in the [validation example]() notebook.\n",
"```"
"````"
]
},
{
Expand All @@ -43,8 +28,8 @@
"source": [
"## What are Chalcogenide Perovskites?\n",
"\n",
"```{admonition} Background\n",
":class:note\n",
"````{admonition} Background\n",
":class: note\n",
"\n",
"We aim to constrain our extraction pipeline to output formulas that adhere to specific chemical and structural criteria, focusing on **chalcogenide perovskite** compounds. \n",
"\n",
Expand All @@ -55,7 +40,7 @@
"- **Optoelectronic Properties**: They exhibit favorable band gaps and strong light absorption, making them ideal for photovoltaic applications.\n",
"- **Stability**: Compared to their halide counterparts, chalcogenide perovskites often show enhanced chemical stability.\n",
"- **Flexibility in Composition**: The ability to tune the composition allows for optimization of properties for specific applications like solar cells, photodetectors, and LEDs.\n",
"```"
"````"
]
},
{
Expand Down Expand Up @@ -84,7 +69,6 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"For this task, we want to retrieve the structure dataset from the review paper [Chalcogenide Perovskites: Tantalizing Prospects, Challenging Materials](https://onlinelibrary.wiley.com/doi/10.1002/adom.202101704).{cite}`Sopiha2021`"
]
},
Expand Down Expand Up @@ -934,6 +918,8 @@
}
],
"source": [
"import llmstructdata\n",
"\n",
"from pymatgen.core import Composition\n",
"\n",
"\n",
Expand Down Expand Up @@ -1074,10 +1060,7 @@
"from pydantic import BaseModel, Field\n",
"from typing import Optional, Literal, List\n",
"from litellm import completion\n",
"import instructor\n",
"from dotenv import load_dotenv\n",
"\n",
"_ = load_dotenv(\"../../.env\", override=True)"
"import instructor"
]
},
{
Expand Down Expand Up @@ -3456,11 +3439,6 @@
"\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
}
],
"metadata": {
Expand Down
9 changes: 7 additions & 2 deletions package/llmstructdata/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@

import warnings

import litellm
import matplotlib as mpl
import matplotlib.pyplot as plt
import pystow
from dotenv import find_dotenv, load_dotenv
from litellm.caching import Cache

# ---------------------------------------------------------------------------
# Import-time setup shared by every notebook in the book.
#
# Importing this package has the following process-wide side effects, in order:
#   1. installs a LiteLLM cache object (`litellm.cache`),
#   2. fetches the shared matplotlib style sheet via pystow and activates it,
#   3. loads environment variables (e.g. API keys) from a `.env` file,
#   4. silences all Python warnings.
# ---------------------------------------------------------------------------

# Enable LiteLLM's cache using the default `Cache()` configuration.
litellm.cache = Cache()

# Style sheet used for all figures. `pystow.ensure` is given the remote URL and
# hands back the path that is passed on to matplotlib below.
# NOTE(review): the pystow module key is "matstructdata" while the package is
# named "llmstructdata" — left unchanged because renaming it would relocate the
# on-disk copy; confirm whether the mismatch is intentional.
url = "https://raw.githubusercontent.com/lamalab-org/plotutils/main/kevin.mplstyle"
path = pystow.ensure("matstructdata", "plotsettings", url=url)

mpl.style.use(path)

# Locate the `.env` file by walking upward from the current working directory
# instead of assuming the importer lives exactly two levels below the
# repository root. A hard-coded "../../.env" is resolved against the working
# directory, so a notebook at any other depth (or a `.env` kept next to the
# notebook) would be skipped silently. The original literal path is kept as a
# fallback so behavior is unchanged when the search finds nothing.
# `override=True` lets values from the file replace variables that are already
# set in the environment.
load_dotenv(find_dotenv(usecwd=True) or "../../.env", override=True)

# Suppress every warning category for the rest of the process so rendered
# notebook output stays clean. This also hides deprecation notices from
# third-party libraries for anything that imports this package.
warnings.filterwarnings("ignore")
18 changes: 13 additions & 5 deletions references.bib
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,19 @@ @article{Ogasawara2015
}

@article{RxnScribe,
author = {Qian, Yujie and Guo, Jiang and Tu, Zhengkai and Coley, Connor W. and Barzilay, Regina},
title = {RxnScribe: A Sequence Generation Model for Reaction Diagram Parsing},
journal = {Journal of Chemical Information and Modeling},
doi = {10.1021/acs.jcim.3c00439}
}
title = {RxnScribe: A Sequence Generation Model for Reaction Diagram Parsing},
volume = {63},
ISSN = {1549-960X},
url = {http://dx.doi.org/10.1021/acs.jcim.3c00439},
DOI = {10.1021/acs.jcim.3c00439},
number = {13},
journal = {Journal of Chemical Information and Modeling},
publisher = {American Chemical Society (ACS)},
author = {Qian, Yujie and Guo, Jiang and Tu, Zhengkai and Coley, Connor W. and Barzilay, Regina},
year = {2023},
month = jun,
pages = {4030--4041}
}

@misc{shi2024instruction,
title={Instruction Tuning With Loss Over Instructions},
Expand Down

0 comments on commit d335206

Please sign in to comment.