diff --git a/examples/fireworks/.env.example b/examples/fireworks/.env.example new file mode 100644 index 00000000..ab200215 --- /dev/null +++ b/examples/fireworks/.env.example @@ -0,0 +1 @@ +FIREWORKS_APIKEY="your fireworks api key" diff --git a/examples/fireworks/csv_scraper_fireworks.py b/examples/fireworks/csv_scraper_fireworks.py new file mode 100644 index 00000000..b1d7526d --- /dev/null +++ b/examples/fireworks/csv_scraper_fireworks.py @@ -0,0 +1,63 @@ +""" +Basic example of scraping pipeline using CSVScraperGraph from CSV documents +""" + +import os +from dotenv import load_dotenv +import pandas as pd +from scrapegraphai.graphs import CSVScraperGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +load_dotenv() + +# ************************************************ +# Read the CSV file +# ************************************************ + +FILE_NAME = "inputs/username.csv" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +text = pd.read_csv(file_path) + +# ************************************************ +# Define the configuration for the graph +# ************************************************ +fireworks_api_key = os.getenv("FIREWORKS_APIKEY") + +graph_config = { + "llm": { + "api_key": fireworks_api_key, + "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Create the CSVScraperGraph instance and run it +# ************************************************ + +csv_scraper_graph = CSVScraperGraph( + prompt="List me all the last names", + source=str(text), # Pass the content of the file, not the file object + config=graph_config +) + +result = csv_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = csv_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/fireworks/csv_scraper_graph_multi_fireworks.py b/examples/fireworks/csv_scraper_graph_multi_fireworks.py new file mode 100644 index 00000000..81393d60 --- /dev/null +++ b/examples/fireworks/csv_scraper_graph_multi_fireworks.py @@ -0,0 +1,63 @@ +""" +Basic example of scraping pipeline using CSVScraperMultiGraph from CSV documents +""" + +import os +from dotenv import load_dotenv +import pandas as pd +from scrapegraphai.graphs import CSVScraperMultiGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +load_dotenv() +# ************************************************ +# Read the CSV file +# ************************************************ + +FILE_NAME = "inputs/username.csv" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +text = pd.read_csv(file_path) + +# ************************************************ +# Define the configuration for the graph +# ************************************************ +fireworks_api_key = os.getenv("FIREWORKS_APIKEY") + +graph_config = { + "llm": { + "api_key": fireworks_api_key, + "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" + }, + "embeddings": { + 
"model": "ollama/nomic-embed-text", + "temperature": 0, + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Create the CSVScraperMultiGraph instance and run it +# ************************************************ + +csv_scraper_graph = CSVScraperMultiGraph( + prompt="List me all the last names", + source=[str(text), str(text)], + config=graph_config +) + +result = csv_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = csv_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/fireworks/custom_graph_fireworks.py b/examples/fireworks/custom_graph_fireworks.py new file mode 100644 index 00000000..a02b774e --- /dev/null +++ b/examples/fireworks/custom_graph_fireworks.py @@ -0,0 +1,118 @@ +""" +Example of custom graph using existing nodes +""" + +import os +from dotenv import load_dotenv + +from langchain_openai import OpenAIEmbeddings +from scrapegraphai.models import OpenAI +from scrapegraphai.graphs import BaseGraph +from scrapegraphai.nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode, RobotsNode +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +fireworks_api_key = os.getenv("FIREWORKS_APIKEY") + +graph_config = { + "llm": { + "api_key": fireworks_api_key, + "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Define the graph nodes +# ************************************************ + +llm_model = OpenAI(graph_config["llm"]) +embedder = OpenAIEmbeddings(api_key=llm_model.openai_api_key) + +# define the nodes for the graph +robot_node = RobotsNode( + input="url", + output=["is_scrapable"], + node_config={ + "llm_model": llm_model, + "force_scraping": True, + "verbose": True, + } +) + +fetch_node = FetchNode( + input="url | local_dir", + output=["doc", "link_urls", "img_urls"], + node_config={ + "verbose": True, + "headless": True, + } +) +parse_node = ParseNode( + input="doc", + output=["parsed_doc"], + node_config={ + "chunk_size": 4096, + "verbose": True, + } +) +rag_node = RAGNode( + input="user_prompt & (parsed_doc | doc)", + output=["relevant_chunks"], + node_config={ + "llm_model": llm_model, + "embedder_model": embedder, + "verbose": True, + } +) +generate_answer_node = GenerateAnswerNode( + input="user_prompt & (relevant_chunks | parsed_doc | doc)", + output=["answer"], + node_config={ + "llm_model": llm_model, + "verbose": True, + } +) + +# ************************************************ +# Create the graph by defining the connections +# ************************************************ + +graph = BaseGraph( + nodes=[ + robot_node, + fetch_node, + parse_node, + rag_node, + generate_answer_node, + ], + edges=[ + (robot_node, fetch_node), + (fetch_node, parse_node), + (parse_node, rag_node), + (rag_node, generate_answer_node) + ], + entry_point=robot_node +) + +# 
************************************************ +# Execute the graph +# ************************************************ + +result, execution_info = graph.execute({ + "user_prompt": "Describe the content", + "url": "https://example.com/" +}) + +# get the answer from the result +result = result.get("answer", "No answer found.") +print(result) diff --git a/examples/fireworks/deep_scraper_fireworks.py b/examples/fireworks/deep_scraper_fireworks.py new file mode 100644 index 00000000..67a80868 --- /dev/null +++ b/examples/fireworks/deep_scraper_fireworks.py @@ -0,0 +1,52 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import DeepScraperGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +fireworks_api_key = os.getenv("FIREWORKS_APIKEY") + +graph_config = { + "llm": { + "api_key": fireworks_api_key, + "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "verbose": True, + "max_depth": 1 +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +deep_scraper_graph = DeepScraperGraph( + prompt="List me all the job titles and detailed job description.", + # also accepts a string with the already downloaded HTML code + source="https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India", + config=graph_config +) + +result = deep_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = deep_scraper_graph.get_execution_info() +print(deep_scraper_graph.get_state("relevant_links")) +print(prettify_exec_info(graph_exec_info)) \ No newline at end of file diff --git a/examples/fireworks/inputs/books.xml b/examples/fireworks/inputs/books.xml new file mode 100644 index 00000000..e3d1fe87 --- /dev/null +++ b/examples/fireworks/inputs/books.xml @@ -0,0 +1,120 @@ + + + + Gambardella, Matthew + XML Developer's Guide + Computer + 44.95 + 2000-10-01 + An in-depth look at creating applications + with XML. + + + Ralls, Kim + Midnight Rain + Fantasy + 5.95 + 2000-12-16 + A former architect battles corporate zombies, + an evil sorceress, and her own childhood to become queen + of the world. + + + Corets, Eva + Maeve Ascendant + Fantasy + 5.95 + 2000-11-17 + After the collapse of a nanotechnology + society in England, the young survivors lay the + foundation for a new society. + + + Corets, Eva + Oberon's Legacy + Fantasy + 5.95 + 2001-03-10 + In post-apocalypse England, the mysterious + agent known only as Oberon helps to create a new life + for the inhabitants of London. Sequel to Maeve + Ascendant. + + + Corets, Eva + The Sundered Grail + Fantasy + 5.95 + 2001-09-10 + The two daughters of Maeve, half-sisters, + battle one another for control of England. Sequel to + Oberon's Legacy. + + + Randall, Cynthia + Lover Birds + Romance + 4.95 + 2000-09-02 + When Carla meets Paul at an ornithology + conference, tempers fly as feathers get ruffled. 
+ + + Thurman, Paula + Splish Splash + Romance + 4.95 + 2000-11-02 + A deep sea diver finds true love twenty + thousand leagues beneath the sea. + + + Knorr, Stefan + Creepy Crawlies + Horror + 4.95 + 2000-12-06 + An anthology of horror stories about roaches, + centipedes, scorpions and other insects. + + + Kress, Peter + Paradox Lost + Science Fiction + 6.95 + 2000-11-02 + After an inadvertant trip through a Heisenberg + Uncertainty Device, James Salway discovers the problems + of being quantum. + + + O'Brien, Tim + Microsoft .NET: The Programming Bible + Computer + 36.95 + 2000-12-09 + Microsoft's .NET initiative is explored in + detail in this deep programmer's reference. + + + O'Brien, Tim + MSXML3: A Comprehensive Guide + Computer + 36.95 + 2000-12-01 + The Microsoft MSXML3 parser is covered in + detail, with attention to XML DOM interfaces, XSLT processing, + SAX and more. + + + Galos, Mike + Visual Studio 7: A Comprehensive Guide + Computer + 49.95 + 2001-04-16 + Microsoft Visual Studio 7 is explored in depth, + looking at how Visual Basic, Visual C++, C#, and ASP+ are + integrated into a comprehensive development + environment. + + \ No newline at end of file diff --git a/examples/fireworks/inputs/example.json b/examples/fireworks/inputs/example.json new file mode 100644 index 00000000..2263184c --- /dev/null +++ b/examples/fireworks/inputs/example.json @@ -0,0 +1,182 @@ +{ + "kind":"youtube#searchListResponse", + "etag":"q4ibjmYp1KA3RqMF4jFLl6PBwOg", + "nextPageToken":"CAUQAA", + "regionCode":"NL", + "pageInfo":{ + "totalResults":1000000, + "resultsPerPage":5 + }, + "items":[ + { + "kind":"youtube#searchResult", + "etag":"QCsHBifbaernVCbLv8Cu6rAeaDQ", + "id":{ + "kind":"youtube#video", + "videoId":"TvWDY4Mm5GM" + }, + "snippet":{ + "publishedAt":"2023-07-24T14:15:01Z", + "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", + "title":"3 Football Clubs Kylian Mbappe Should Avoid Signing ✍️❌⚽️ #football #mbappe #shorts", + "description":"", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"FC Motivate", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T14:15:01Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"0NG5QHdtIQM_V-DBJDEf-jK_Y9k", + "id":{ + "kind":"youtube#video", + "videoId":"aZM_42CcNZ4" + }, + "snippet":{ + "publishedAt":"2023-07-24T16:09:27Z", + "channelId":"UCM5gMM_HqfKHYIEJ3lstMUA", + "title":"Which Football Club Could Cristiano Ronaldo Afford To Buy? 
💰", + "description":"Sign up to Sorare and get a FREE card: https://sorare.pxf.io/NellisShorts Give Soraredata a go for FREE: ...", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"John Nellis", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T16:09:27Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"WbBz4oh9I5VaYj91LjeJvffrBVY", + "id":{ + "kind":"youtube#video", + "videoId":"wkP3XS3aNAY" + }, + "snippet":{ + "publishedAt":"2023-07-24T16:00:50Z", + "channelId":"UC4EP1dxFDPup_aFLt0ElsDw", + "title":"PAULO DYBALA vs THE WORLD'S LONGEST FREEKICK WALL", + "description":"Can Paulo Dybala curl a football around the World's longest free kick wall? We met up with the World Cup winner and put him to ...", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"Shoot for Love", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T16:00:50Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"juxv_FhT_l4qrR05S1QTrb4CGh8", + "id":{ + "kind":"youtube#video", + "videoId":"rJkDZ0WvfT8" + }, + "snippet":{ + "publishedAt":"2023-07-24T10:00:39Z", + "channelId":"UCO8qj5u80Ga7N_tP3BZWWhQ", + "title":"TOP 10 DEFENDERS 2023", + "description":"SoccerKingz https://soccerkingz.nl Use code: 'ILOVEHOF' to get 10% off. TOP 10 DEFENDERS 2023 Follow us! • Instagram ...", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"Home of Football", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T10:00:39Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"wtuknXTmI1txoULeH3aWaOuXOow", + "id":{ + "kind":"youtube#video", + "videoId":"XH0rtu4U6SE" + }, + "snippet":{ + "publishedAt":"2023-07-21T16:30:05Z", + "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", + "title":"3 Things You Didn't Know About Erling Haaland ⚽️🇳🇴 #football #haaland #shorts", + "description":"", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"FC Motivate", + "liveBroadcastContent":"none", + "publishTime":"2023-07-21T16:30:05Z" + } + } + ] +} \ No newline at end of file diff --git a/examples/fireworks/inputs/plain_html_example.txt b/examples/fireworks/inputs/plain_html_example.txt new file mode 100644 index 00000000..78f814ae --- /dev/null +++ b/examples/fireworks/inputs/plain_html_example.txt @@ -0,0 +1,105 @@ + +
+<!-- HTML markup of the sample projects page omitted; this file is the source read by scrape_plain_text_fireworks.py -->
+Projects
+ \ No newline at end of file diff --git a/examples/fireworks/inputs/username.csv b/examples/fireworks/inputs/username.csv new file mode 100644 index 00000000..006ac8e6 --- /dev/null +++ b/examples/fireworks/inputs/username.csv @@ -0,0 +1,7 @@ +Username; Identifier;First name;Last name +booker12;9012;Rachel;Booker +grey07;2070;Laura;Grey +johnson81;4081;Craig;Johnson +jenkins46;9346;Mary;Jenkins +smith79;5079;Jamie;Smith + diff --git a/examples/fireworks/json_scraper_fireworkspy.py b/examples/fireworks/json_scraper_fireworkspy.py new file mode 100644 index 00000000..0dd188fb --- /dev/null +++ b/examples/fireworks/json_scraper_fireworkspy.py @@ -0,0 +1,65 @@ +""" +Basic example of scraping pipeline using JSONScraperGraph from JSON documents +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import JSONScraperGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +load_dotenv() + +# ************************************************ +# Read the JSON file +# ************************************************ + +FILE_NAME = "inputs/example.json" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +fireworks_api_key = os.getenv("FIREWORKS_APIKEY") + +graph_config = { + "llm": { + "api_key": fireworks_api_key, + "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Create the JSONScraperGraph instance and run it +# ************************************************ + +json_scraper_graph = JSONScraperGraph( + prompt="List me all the authors, title and genres of the books", + source=text, # Pass the content of the file, not the file object + config=graph_config +) + +result = json_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = json_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") + diff --git a/examples/fireworks/json_scraper_multi_fireworks.py b/examples/fireworks/json_scraper_multi_fireworks.py new file mode 100644 index 00000000..b4cf4fc7 --- /dev/null +++ b/examples/fireworks/json_scraper_multi_fireworks.py @@ -0,0 +1,44 @@ +""" +Module for showing how PDFScraper multi works +""" +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import JSONScraperMultiGraph + +load_dotenv() + +fireworks_api_key = os.getenv("FIREWORKS_APIKEY") + +graph_config = { + "llm": { + "api_key": fireworks_api_key, + "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "verbose": True, + "headless": False, +} + +FILE_NAME = "inputs/example.json" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with 
open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +sources = [text, text] + +multiple_search_graph = JSONScraperMultiGraph( + prompt= "List me all the authors, title and genres of the books", + source= sources, + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/fireworks/pdf_scraper_fireworks.py b/examples/fireworks/pdf_scraper_fireworks.py new file mode 100644 index 00000000..20db556b --- /dev/null +++ b/examples/fireworks/pdf_scraper_fireworks.py @@ -0,0 +1,45 @@ +import os, json +from dotenv import load_dotenv +from scrapegraphai.graphs import PDFScraperGraph + +load_dotenv() + + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +fireworks_api_key = os.getenv("FIREWORKS_APIKEY") + +graph_config = { + "llm": { + "api_key": fireworks_api_key, + "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "verbose": True, +} + +source = """ + The Divine Comedy, Italian La Divina Commedia, original name La commedia, long narrative poem written in Italian + circa 1308/21 by Dante. It is usually held to be one of the world s great works of literature. + Divided into three major sections—Inferno, Purgatorio, and Paradiso—the narrative traces the journey of Dante + from darkness and error to the revelation of the divine light, culminating in the Beatific Vision of God. + Dante is guided by the Roman poet Virgil, who represents the epitome of human knowledge, from the dark wood + through the descending circles of the pit of Hell (Inferno). He then climbs the mountain of Purgatory, guided + by the Roman poet Statius, who represents the fulfilment of human knowledge, and is finally led by his lifelong love, + the Beatrice of his earlier poetry, through the celestial spheres of Paradise. 
+""" + +pdf_scraper_graph = PDFScraperGraph( + prompt="Summarize the text and find the main topics", + source=source, + config=graph_config, +) +result = pdf_scraper_graph.run() + +print(json.dumps(result, indent=4)) diff --git a/examples/fireworks/pdf_scraper_multi_fireworks.py b/examples/fireworks/pdf_scraper_multi_fireworks.py new file mode 100644 index 00000000..891a4454 --- /dev/null +++ b/examples/fireworks/pdf_scraper_multi_fireworks.py @@ -0,0 +1,69 @@ +""" +Module for showing how PDFScraper multi works +""" +import os +import json +from typing import List +from dotenv import load_dotenv +from pydantic import BaseModel, Field +from scrapegraphai.graphs import PdfScraperMultiGraph + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +fireworks_api_key = os.getenv("FIREWORKS_APIKEY") + +graph_config = { + "llm": { + "api_key": fireworks_api_key, + "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "verbose": True, +} + +# ************************************************ +# Define the output schema for the graph +# ************************************************ + +class Article(BaseModel): + independent_variable: str = Field(description="(IV): The variable that is manipulated or considered as the primary cause affecting other variables.") + dependent_variable: str = Field(description="(DV) The variable that is measured or observed, which is expected to change as a result of variations in the Independent Variable.") + exogenous_shock: str = Field(description="Identify any external or unexpected events used in the study that serve as a natural experiment or provide a unique setting for observing the effects on the IV and DV.") + +class Articles(BaseModel): + articles: List[Article] + +# ************************************************ +# Define the sources for the graph +# ************************************************ + +sources = [ + "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather the interaction between call center architecture and outdoor weather conditions in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.", + "The diffusion of social media coincided with a worsening of mental health conditions among adolescents and young adults in the United States, giving rise to speculation that social media might be detrimental to mental health. Our analysis couples data on student mental health around the years of Facebook's expansion with a generalized difference-in-differences empirical strategy. 
We find that the roll-out of Facebook at a college increased symptoms of poor mental health, especially depression. We also find that, among students predicted to be most susceptible to mental illness, the introduction of Facebook led to increased utilization of mental healthcare services. Lastly, we find that, after the introduction of Facebook, students were more likely to report experiencing impairments to academic performance resulting from poor mental health. Additional evidence on mechanisms suggests that the results are due to Facebook fostering unfavorable social comparisons." +] + +prompt = """ +Analyze the abstracts provided from an academic journal article to extract and clearly identify the Independent Variable (IV), Dependent Variable (DV), and Exogenous Shock. +""" + +# ******************************************************* +# Create the SmartScraperMultiGraph instance and run it +# ******************************************************* + +multiple_search_graph = PdfScraperMultiGraph( + prompt=prompt, + source= sources, + schema=Articles, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/fireworks/scrape_plain_text_fireworks.py b/examples/fireworks/scrape_plain_text_fireworks.py new file mode 100644 index 00000000..a45b2691 --- /dev/null +++ b/examples/fireworks/scrape_plain_text_fireworks.py @@ -0,0 +1,62 @@ +""" +Basic example of scraping pipeline using SmartScraper from text +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +# ************************************************ +# Read the text file +# ************************************************ + +FILE_NAME = "inputs/plain_html_example.txt" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +# It could be also a http request using the request model +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +fireworks_api_key = os.getenv("FIREWORKS_APIKEY") + +graph_config = { + "llm": { + "api_key": fireworks_api_key, + "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, +} + + + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their description.", + source=text, + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/fireworks/script_generator_fireworks.py b/examples/fireworks/script_generator_fireworks.py new file mode 100644 index 00000000..dea59e12 --- /dev/null +++ b/examples/fireworks/script_generator_fireworks.py @@ -0,0 +1,54 @@ +""" +Basic example of scraping pipeline using ScriptCreatorGraph +""" + +import os +from dotenv import load_dotenv +from 
scrapegraphai.graphs import ScriptCreatorGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +fireworks_api_key = os.getenv("FIREWORKS_APIKEY") + +graph_config = { + "llm": { + "api_key": fireworks_api_key, + "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "verbose": True, + "headless": False, + "library": "beautifulsoup" + +} + +# ************************************************ +# Create the ScriptCreatorGraph instance and run it +# ************************************************ + +script_creator_graph = ScriptCreatorGraph( + prompt="List me all the projects with their description.", + # also accepts a string with the already downloaded HTML code + source="https://perinim.github.io/projects", + config=graph_config +) + +result = script_creator_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = script_creator_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + diff --git a/examples/fireworks/script_generator_schema_fireworks.py b/examples/fireworks/script_generator_schema_fireworks.py new file mode 100644 index 00000000..f7aa4c83 --- /dev/null +++ b/examples/fireworks/script_generator_schema_fireworks.py @@ -0,0 +1,66 @@ +""" +Basic example of scraping pipeline using ScriptCreatorGraph +""" + +import os +from typing import List +from dotenv import load_dotenv +from pydantic import BaseModel, Field +from scrapegraphai.graphs import ScriptCreatorGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +# ************************************************ +# Define the schema for the graph +# ************************************************ + +class Project(BaseModel): + title: str = Field(description="The title of the project") + description: str = Field(description="The description of the project") + +class Projects(BaseModel): + projects: List[Project] + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +fireworks_api_key = os.getenv("FIREWORKS_APIKEY") + +graph_config = { + "llm": { + "api_key": fireworks_api_key, + "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "verbose": True, + "library": "beautifulsoup", +} + +# ************************************************ +# Create the ScriptCreatorGraph instance and run it +# ************************************************ + +script_creator_graph = ScriptCreatorGraph( + prompt="List me all the projects with their description.", + # also accepts a string with the already downloaded HTML code + source="https://perinim.github.io/projects", + config=graph_config, + schema=Projects +) + +result = script_creator_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = script_creator_graph.get_execution_info() 
+print(prettify_exec_info(graph_exec_info)) + diff --git a/examples/fireworks/script_multi_generator_fireworks.py b/examples/fireworks/script_multi_generator_fireworks.py new file mode 100644 index 00000000..42aff923 --- /dev/null +++ b/examples/fireworks/script_multi_generator_fireworks.py @@ -0,0 +1,58 @@ +""" +Basic example of scraping pipeline using ScriptCreatorGraph +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import ScriptCreatorMultiGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +fireworks_api_key = os.getenv("FIREWORKS_APIKEY") + +graph_config = { + "llm": { + "api_key": fireworks_api_key, + "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "verbose": True, + "library": "beautifulsoup", +} +# ************************************************ +# Create the ScriptCreatorGraph instance and run it +# ************************************************ + +urls=[ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" +] + +# ************************************************ +# Create the ScriptCreatorGraph instance and run it +# ************************************************ + +script_creator_graph = ScriptCreatorMultiGraph( + prompt="Who is Marco Perini?", + source=urls, + config=graph_config +) + +result = script_creator_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = script_creator_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/fireworks/search_graph_fireworks.py b/examples/fireworks/search_graph_fireworks.py new file mode 100644 index 00000000..545bbde8 --- /dev/null +++ b/examples/fireworks/search_graph_fireworks.py @@ -0,0 +1,56 @@ +""" +Example of Search Graph +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SearchGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +fireworks_api_key = os.getenv("FIREWORKS_APIKEY") + +graph_config = { + "llm": { + "api_key": fireworks_api_key, + "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "max_results": 2, + "verbose": True, + "headless": False, +} + + + +# ************************************************ +# Create the SearchGraph instance and run it +# ************************************************ + +search_graph = SearchGraph( + prompt="List me Chioggia's famous dishes", + config=graph_config +) + +result = search_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = search_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json and csv +convert_to_csv(result, "result") +convert_to_json(result, 
"result") diff --git a/examples/fireworks/search_graph_schema_fireworks.py b/examples/fireworks/search_graph_schema_fireworks.py new file mode 100644 index 00000000..9180522b --- /dev/null +++ b/examples/fireworks/search_graph_schema_fireworks.py @@ -0,0 +1,68 @@ +""" +Example of Search Graph +""" + +import os +from dotenv import load_dotenv +load_dotenv() + +from scrapegraphai.graphs import SearchGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +from pydantic import BaseModel, Field +from typing import List + +# ************************************************ +# Define the output schema for the graph +# ************************************************ + +class Dish(BaseModel): + name: str = Field(description="The name of the dish") + description: str = Field(description="The description of the dish") + +class Dishes(BaseModel): + dishes: List[Dish] + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +fireworks_api_key = os.getenv("FIREWORKS_APIKEY") + +graph_config = { + "llm": { + "api_key": fireworks_api_key, + "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "max_results": 2, + "verbose": True, + "headless": False, +} +# ************************************************ +# Create the SearchGraph instance and run it +# ************************************************ + +search_graph = SearchGraph( + prompt="List me Chioggia's famous dishes", + config=graph_config, + schema=Dishes +) + +result = search_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = search_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json and csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/fireworks/smart_scraper_fireworks.py b/examples/fireworks/smart_scraper_fireworks.py new file mode 100644 index 00000000..40071d8f --- /dev/null +++ b/examples/fireworks/smart_scraper_fireworks.py @@ -0,0 +1,52 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" + +import os, json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +fireworks_api_key = os.getenv("FIREWORKS_APIKEY") + +graph_config = { + "llm": { + "api_key": fireworks_api_key, + "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their description", + # also accepts a string with the already downloaded HTML code + source="https://perinim.github.io/projects/", + config=graph_config, +) + +result = smart_scraper_graph.run() 
+print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/fireworks/smart_scraper_multi_fireworks.py b/examples/fireworks/smart_scraper_multi_fireworks.py new file mode 100644 index 00000000..68e28055 --- /dev/null +++ b/examples/fireworks/smart_scraper_multi_fireworks.py @@ -0,0 +1,46 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" + +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperMultiGraph + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ +fireworks_api_key = os.getenv("FIREWORKS_APIKEY") + +graph_config = { + "llm": { + "api_key": fireworks_api_key, + "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "verbose": True, + "headless": False, +} + +# ******************************************************* +# Create the SmartScraperMultiGraph instance and run it +# ******************************************************* + +multiple_search_graph = SmartScraperMultiGraph( + prompt="Who is Marco Perini?", + source= [ + "https://perinim.github.io/", + "https://perinim.github.io/cv/" + ], + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/fireworks/smart_scraper_schema_fireworks.py b/examples/fireworks/smart_scraper_schema_fireworks.py new file mode 100644 index 00000000..b8685c3e --- /dev/null +++ b/examples/fireworks/smart_scraper_schema_fireworks.py @@ -0,0 +1,55 @@ +""" +Basic example of scraping pipeline using SmartScraper with schema +""" + +import os, json +from typing import List +from dotenv import load_dotenv +from pydantic import BaseModel, Field +from scrapegraphai.graphs import SmartScraperGraph + +load_dotenv() + +# ************************************************ +# Define the output schema for the graph +# ************************************************ + +class Project(BaseModel): + title: str = Field(description="The title of the project") + description: str = Field(description="The description of the project") + +class Projects(BaseModel): + projects: List[Project] + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +fireworks_api_key = os.getenv("FIREWORKS_APIKEY") + +graph_config = { + "llm": { + "api_key": fireworks_api_key, + "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "verbose": True, + "headless": False, +} +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their description", + source="https://perinim.github.io/projects/", + schema=Projects, + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) diff 
--git a/examples/fireworks/xml_scraper_fireworks.py b/examples/fireworks/xml_scraper_fireworks.py new file mode 100644 index 00000000..efc98bd8 --- /dev/null +++ b/examples/fireworks/xml_scraper_fireworks.py @@ -0,0 +1,64 @@ +""" +Basic example of scraping pipeline using XMLScraperGraph from XML documents +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import XMLScraperGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +load_dotenv() + +# ************************************************ +# Read the XML file +# ************************************************ + +FILE_NAME = "inputs/books.xml" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ +fireworks_api_key = os.getenv("FIREWORKS_APIKEY") + +graph_config = { + "llm": { + "api_key": fireworks_api_key, + "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Create the XMLScraperGraph instance and run it +# ************************************************ + +xml_scraper_graph = XMLScraperGraph( + prompt="List me all the authors, title and genres of the books", + source=text, # Pass the content of the file, not the file object + config=graph_config +) + +result = xml_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = xml_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") + diff --git a/examples/fireworks/xml_scraper_graph_multi_fireworks.py b/examples/fireworks/xml_scraper_graph_multi_fireworks.py new file mode 100644 index 00000000..d14b8db0 --- /dev/null +++ b/examples/fireworks/xml_scraper_graph_multi_fireworks.py @@ -0,0 +1,63 @@ +""" +Basic example of scraping pipeline using XMLScraperMultiGraph from XML documents +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import XMLScraperMultiGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +load_dotenv() + +# ************************************************ +# Read the XML file +# ************************************************ + +FILE_NAME = "inputs/books.xml" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +fireworks_api_key = os.getenv("FIREWORKS_APIKEY") + +graph_config = { + "llm": { + "api_key": fireworks_api_key, + "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "verbose": True, + "headless": False, +} 
+# ************************************************ +# Create the XMLScraperMultiGraph instance and run it +# ************************************************ + +xml_scraper_graph = XMLScraperMultiGraph( + prompt="List me all the authors, title and genres of the books", + source=[text, text], # Pass the content of the file, not the file object + config=graph_config +) + +result = xml_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = xml_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/pyproject.toml b/pyproject.toml index a1b266f1..8db07501 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,7 @@ dependencies = [ "semchunk==1.0.1", "html2text==2024.2.26", "trafilatura==1.10.0", + "langchain-fireworks==0.1.3" ] license = "MIT" diff --git a/requirements-dev.lock b/requirements-dev.lock index 0a086bf2..963ceaa9 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -8,11 +8,12 @@ # with-sources: false -e file:. -aiofiles==24.1.0 +aiofiles==23.2.1 # via burr aiohttp==3.9.5 # via langchain # via langchain-community + # via langchain-fireworks aiosignal==1.3.1 # via aiohttp alabaster==0.7.16 @@ -21,26 +22,20 @@ altair==5.3.0 # via streamlit annotated-types==0.7.0 # via pydantic -anthropic==0.30.0 +anthropic==0.26.1 # via langchain-anthropic -anyio==4.4.0 +anyio==4.3.0 # via anthropic # via groq # via httpx # via openai # via starlette # via watchfiles -astroid==3.2.2 - # via pylint -async-timeout==4.0.3 - # via aiohttp - # via langchain attrs==23.2.0 # via aiohttp # via jsonschema # via referencing babel==2.15.0 - # via courlan # via sphinx beautifulsoup4==4.12.3 # via furo @@ -48,9 +43,9 @@ beautifulsoup4==4.12.3 # via scrapegraphai blinker==1.8.2 # via streamlit -boto3==1.34.134 +boto3==1.34.113 # via langchain-aws -botocore==1.34.134 +botocore==1.34.113 # via boto3 # via s3transfer burr==0.22.1 @@ -58,15 +53,12 @@ burr==0.22.1 cachetools==5.3.3 # via google-auth # via streamlit -certifi==2024.6.2 +certifi==2024.2.2 # via httpcore # via httpx # via requests - # via trafilatura charset-normalizer==3.3.2 - # via htmldate # via requests - # via trafilatura click==8.1.7 # via burr # via streamlit @@ -74,19 +66,13 @@ click==8.1.7 # via uvicorn contourpy==1.2.1 # via matplotlib -courlan==1.2.0 - # via trafilatura cycler==0.12.1 # via matplotlib -dataclasses-json==0.6.7 +dataclasses-json==0.6.6 # via langchain # via langchain-community -dateparser==1.2.0 - # via htmldate defusedxml==0.7.1 # via langchain-anthropic -dill==0.3.8 - # via pylint distro==1.9.0 # via anthropic # via groq @@ -95,29 +81,29 @@ dnspython==2.6.1 # via email-validator docutils==0.19 # via sphinx -email-validator==2.2.0 +email-validator==2.1.1 # via fastapi -exceptiongroup==1.2.1 - # via anyio - # via pytest faiss-cpu==1.8.0 # via scrapegraphai fastapi==0.111.0 # via burr + # via fastapi-pagination fastapi-cli==0.0.4 # via fastapi -fastapi-pagination==0.12.25 +fastapi-pagination==0.12.24 # via burr -filelock==3.15.4 +filelock==3.14.0 # via huggingface-hub -fonttools==4.53.0 +fireworks-ai==0.14.0 + # via langchain-fireworks +fonttools==4.52.1 # via matplotlib free-proxy==1.1.1 # via scrapegraphai frozenlist==1.4.1 # via aiohttp # via aiosignal -fsspec==2024.6.1 +fsspec==2024.5.0 # via huggingface-hub furo==2024.5.6 # via scrapegraphai @@ 
-129,13 +115,13 @@ google==3.0.0 # via scrapegraphai google-ai-generativelanguage==0.6.4 # via google-generativeai -google-api-core==2.19.1 +google-api-core==2.19.0 # via google-ai-generativelanguage # via google-api-python-client # via google-generativeai -google-api-python-client==2.134.0 +google-api-python-client==2.130.0 # via google-generativeai -google-auth==2.30.0 +google-auth==2.29.0 # via google-ai-generativelanguage # via google-api-core # via google-api-python-client @@ -145,7 +131,7 @@ google-auth-httplib2==0.2.0 # via google-api-python-client google-generativeai==0.5.4 # via langchain-google-genai -googleapis-common-protos==1.63.2 +googleapis-common-protos==1.63.0 # via google-api-core # via grpcio-status graphviz==0.20.3 @@ -153,10 +139,9 @@ graphviz==0.20.3 # via scrapegraphai greenlet==3.0.3 # via playwright - # via sqlalchemy -groq==0.9.0 +groq==0.8.0 # via langchain-groq -grpcio==1.64.1 +grpcio==1.64.0 # via google-api-core # via grpcio-status grpcio-status==1.62.2 @@ -166,8 +151,6 @@ h11==0.14.0 # via uvicorn html2text==2024.2.26 # via scrapegraphai -htmldate==1.8.1 - # via trafilatura httpcore==1.0.5 # via httpx httplib2==0.22.0 @@ -178,9 +161,12 @@ httptools==0.6.1 httpx==0.27.0 # via anthropic # via fastapi + # via fireworks-ai # via groq # via openai -huggingface-hub==0.23.4 +httpx-sse==0.4.0 + # via fireworks-ai +huggingface-hub==0.23.1 # via tokenizers idna==3.7 # via anyio @@ -190,21 +176,15 @@ idna==3.7 # via yarl imagesize==1.4.1 # via sphinx -importlib-metadata==8.0.0 - # via sphinx -importlib-resources==6.4.0 - # via matplotlib iniconfig==2.0.0 # via pytest -isort==5.13.2 - # via pylint jinja2==3.1.4 # via altair # via burr # via fastapi # via pydeck # via sphinx -jiter==0.5.0 +jiter==0.4.0 # via anthropic jmespath==1.0.1 # via boto3 @@ -212,14 +192,12 @@ jmespath==1.0.1 jsonpatch==1.33 # via langchain # via langchain-core -jsonpointer==3.0.0 +jsonpointer==2.4 # via jsonpatch jsonschema==4.22.0 # via altair jsonschema-specifications==2023.12.1 # via jsonschema -justext==3.0.1 - # via trafilatura kiwisolver==1.4.5 # via matplotlib langchain==0.1.15 @@ -235,10 +213,13 @@ langchain-core==0.1.52 # via langchain-anthropic # via langchain-aws # via langchain-community + # via langchain-fireworks # via langchain-google-genai # via langchain-groq # via langchain-openai # via langchain-text-splitters +langchain-fireworks==0.1.3 + # via scrapegraphai langchain-google-genai==1.0.3 # via scrapegraphai langchain-groq==0.1.3 @@ -247,7 +228,7 @@ langchain-openai==0.1.6 # via scrapegraphai langchain-text-splitters==0.0.2 # via langchain -langsmith==0.1.82 +langsmith==0.1.63 # via langchain # via langchain-community # via langchain-core @@ -255,22 +236,14 @@ loguru==0.7.2 # via burr lxml==5.2.2 # via free-proxy - # via htmldate - # via justext - # via lxml-html-clean - # via trafilatura -lxml-html-clean==0.1.1 - # via lxml markdown-it-py==3.0.0 # via rich markupsafe==2.1.5 # via jinja2 -marshmallow==3.21.3 +marshmallow==3.21.2 # via dataclasses-json matplotlib==3.9.0 # via burr -mccabe==0.7.0 - # via pylint mdurl==0.1.2 # via markdown-it-py minify-html==0.15.0 @@ -293,10 +266,11 @@ numpy==1.26.4 # via pydeck # via sf-hamilton # via streamlit -openai==1.35.6 +openai==1.30.3 # via burr + # via langchain-fireworks # via langchain-openai -orjson==3.10.5 +orjson==3.10.3 # via fastapi # via langsmith packaging==23.2 @@ -314,16 +288,15 @@ pandas==2.2.2 # via sf-hamilton # via streamlit pillow==10.3.0 + # via fireworks-ai # via matplotlib # via streamlit -platformdirs==4.2.2 - # via 
pylint playwright==1.43.0 # via scrapegraphai # via undetected-playwright pluggy==1.5.0 # via pytest -proto-plus==1.24.0 +proto-plus==1.23.0 # via google-ai-generativelanguage # via google-api-core protobuf==4.25.3 @@ -341,18 +314,19 @@ pyasn1==0.6.0 # via rsa pyasn1-modules==0.4.0 # via google-auth -pydantic==2.7.4 +pydantic==2.7.1 # via anthropic # via burr # via fastapi # via fastapi-pagination + # via fireworks-ai # via google-generativeai # via groq # via langchain # via langchain-core # via langsmith # via openai -pydantic-core==2.18.4 +pydantic-core==2.18.2 # via pydantic pydeck==0.9.1 # via streamlit @@ -362,7 +336,6 @@ pygments==2.18.0 # via furo # via rich # via sphinx -pylint==3.2.5 pyparsing==3.1.2 # via httplib2 # via matplotlib @@ -371,8 +344,6 @@ pytest==8.0.0 pytest-mock==3.14.0 python-dateutil==2.9.0.post0 # via botocore - # via dateparser - # via htmldate # via matplotlib # via pandas python-dotenv==1.0.1 @@ -381,7 +352,6 @@ python-dotenv==1.0.1 python-multipart==0.0.9 # via fastapi pytz==2024.1 - # via dateparser # via pandas pyyaml==6.0.1 # via huggingface-hub @@ -393,15 +363,15 @@ referencing==0.35.1 # via jsonschema # via jsonschema-specifications regex==2024.5.15 - # via dateparser # via tiktoken -requests==2.32.3 +requests==2.32.2 # via burr # via free-proxy # via google-api-core # via huggingface-hub # via langchain # via langchain-community + # via langchain-fireworks # via langsmith # via sphinx # via streamlit @@ -414,11 +384,11 @@ rpds-py==0.18.1 # via referencing rsa==4.9 # via google-auth -s3transfer==0.10.2 +s3transfer==0.10.1 # via boto3 semchunk==1.0.1 # via scrapegraphai -sf-hamilton==1.67.0 +sf-hamilton==1.63.0 # via burr shellingham==1.5.4 # via typer @@ -454,14 +424,14 @@ sphinxcontrib-qthelp==1.0.7 # via sphinx sphinxcontrib-serializinghtml==1.1.10 # via sphinx -sqlalchemy==2.0.31 +sqlalchemy==2.0.30 # via langchain # via langchain-community starlette==0.37.2 # via fastapi -streamlit==1.36.0 +streamlit==1.35.0 # via burr -tenacity==8.4.2 +tenacity==8.3.0 # via langchain # via langchain-community # via langchain-core @@ -469,20 +439,13 @@ tenacity==8.4.2 tiktoken==0.6.0 # via langchain-openai # via scrapegraphai -tld==0.13 - # via courlan tokenizers==0.19.1 # via anthropic toml==0.10.2 # via streamlit -tomli==2.0.1 - # via pylint - # via pytest -tomlkit==0.12.5 - # via pylint toolz==0.12.1 # via altair -tornado==6.4.1 +tornado==6.4 # via streamlit tqdm==4.66.4 # via google-generativeai @@ -490,15 +453,10 @@ tqdm==4.66.4 # via openai # via scrapegraphai # via semchunk -trafilatura==1.10.0 - # via scrapegraphai typer==0.12.3 # via fastapi-cli -typing-extensions==4.12.2 - # via altair +typing-extensions==4.12.0 # via anthropic - # via anyio - # via astroid # via fastapi # via fastapi-pagination # via google-generativeai @@ -508,44 +466,33 @@ typing-extensions==4.12.2 # via pydantic # via pydantic-core # via pyee - # via pylint # via sf-hamilton # via sqlalchemy - # via starlette # via streamlit # via typer # via typing-inspect - # via uvicorn typing-inspect==0.9.0 # via dataclasses-json # via sf-hamilton tzdata==2024.1 # via pandas -tzlocal==5.2 - # via dateparser ujson==5.10.0 # via fastapi undetected-playwright==0.3.0 # via scrapegraphai uritemplate==4.1.1 # via google-api-python-client -urllib3==1.26.19 +urllib3==1.26.18 # via botocore - # via courlan - # via htmldate # via requests - # via trafilatura -uvicorn==0.30.1 +uvicorn==0.29.0 # via burr # via fastapi uvloop==0.19.0 # via uvicorn -watchfiles==0.22.0 +watchfiles==0.21.0 # via uvicorn 
websockets==12.0 # via uvicorn yarl==1.9.4 # via aiohttp -zipp==3.19.2 - # via importlib-metadata - # via importlib-resources diff --git a/requirements.lock b/requirements.lock index ba80c468..a27966ba 100644 --- a/requirements.lock +++ b/requirements.lock @@ -11,80 +11,69 @@ aiohttp==3.9.5 # via langchain # via langchain-community + # via langchain-fireworks aiosignal==1.3.1 # via aiohttp annotated-types==0.7.0 # via pydantic -anthropic==0.30.0 +anthropic==0.26.1 # via langchain-anthropic -anyio==4.4.0 +anyio==4.3.0 # via anthropic # via groq # via httpx # via openai -async-timeout==4.0.3 - # via aiohttp - # via langchain attrs==23.2.0 # via aiohttp -babel==2.15.0 - # via courlan beautifulsoup4==4.12.3 # via google # via scrapegraphai -boto3==1.34.134 +boto3==1.34.113 # via langchain-aws -botocore==1.34.134 +botocore==1.34.113 # via boto3 # via s3transfer cachetools==5.3.3 # via google-auth -certifi==2024.6.2 +certifi==2024.2.2 # via httpcore # via httpx # via requests - # via trafilatura charset-normalizer==3.3.2 - # via htmldate # via requests - # via trafilatura -courlan==1.2.0 - # via trafilatura -dataclasses-json==0.6.7 +dataclasses-json==0.6.6 # via langchain # via langchain-community -dateparser==1.2.0 - # via htmldate defusedxml==0.7.1 # via langchain-anthropic distro==1.9.0 # via anthropic # via groq # via openai -exceptiongroup==1.2.1 - # via anyio faiss-cpu==1.8.0 # via scrapegraphai -filelock==3.15.4 +filelock==3.14.0 # via huggingface-hub +fireworks-ai==0.14.0 + # via langchain-fireworks free-proxy==1.1.1 # via scrapegraphai frozenlist==1.4.1 # via aiohttp # via aiosignal -fsspec==2024.6.1 +fsspec==2024.5.0 # via huggingface-hub google==3.0.0 # via scrapegraphai google-ai-generativelanguage==0.6.4 # via google-generativeai -google-api-core==2.19.1 +google-api-core==2.19.0 # via google-ai-generativelanguage # via google-api-python-client # via google-generativeai -google-api-python-client==2.134.0 +google-api-python-client==2.130.0 # via google-generativeai -google-auth==2.30.0 +google-auth==2.29.0 # via google-ai-generativelanguage # via google-api-core # via google-api-python-client @@ -94,17 +83,16 @@ google-auth-httplib2==0.2.0 # via google-api-python-client google-generativeai==0.5.4 # via langchain-google-genai -googleapis-common-protos==1.63.2 +googleapis-common-protos==1.63.0 # via google-api-core # via grpcio-status graphviz==0.20.3 # via scrapegraphai greenlet==3.0.3 # via playwright - # via sqlalchemy -groq==0.9.0 +groq==0.8.0 # via langchain-groq -grpcio==1.64.1 +grpcio==1.64.0 # via google-api-core # via grpcio-status grpcio-status==1.62.2 @@ -113,8 +101,6 @@ h11==0.14.0 # via httpcore html2text==2024.2.26 # via scrapegraphai -htmldate==1.8.1 - # via trafilatura httpcore==1.0.5 # via httpx httplib2==0.22.0 @@ -122,16 +108,19 @@ httplib2==0.22.0 # via google-auth-httplib2 httpx==0.27.0 # via anthropic + # via fireworks-ai # via groq # via openai -huggingface-hub==0.23.4 +httpx-sse==0.4.0 + # via fireworks-ai +huggingface-hub==0.23.1 # via tokenizers idna==3.7 # via anyio # via httpx # via requests # via yarl -jiter==0.5.0 +jiter==0.4.0 # via anthropic jmespath==1.0.1 # via boto3 @@ -139,10 +128,8 @@ jmespath==1.0.1 jsonpatch==1.33 # via langchain # via langchain-core -jsonpointer==3.0.0 +jsonpointer==2.4 # via jsonpatch -justext==3.0.1 - # via trafilatura langchain==0.1.15 # via scrapegraphai langchain-anthropic==0.1.11 @@ -156,10 +143,13 @@ langchain-core==0.1.52 # via langchain-anthropic # via langchain-aws # via langchain-community + # via langchain-fireworks # 
via langchain-google-genai # via langchain-groq # via langchain-openai # via langchain-text-splitters +langchain-fireworks==0.1.3 + # via scrapegraphai langchain-google-genai==1.0.3 # via scrapegraphai langchain-groq==0.1.3 @@ -168,19 +158,13 @@ langchain-openai==0.1.6 # via scrapegraphai langchain-text-splitters==0.0.2 # via langchain -langsmith==0.1.82 +langsmith==0.1.63 # via langchain # via langchain-community # via langchain-core lxml==5.2.2 # via free-proxy - # via htmldate - # via justext - # via lxml-html-clean - # via trafilatura -lxml-html-clean==0.1.1 - # via lxml -marshmallow==3.21.3 +marshmallow==3.21.2 # via dataclasses-json minify-html==0.15.0 # via scrapegraphai @@ -195,9 +179,10 @@ numpy==1.26.4 # via langchain-aws # via langchain-community # via pandas -openai==1.35.6 +openai==1.30.3 + # via langchain-fireworks # via langchain-openai -orjson==3.10.5 +orjson==3.10.3 # via langsmith packaging==23.2 # via huggingface-hub @@ -205,10 +190,12 @@ packaging==23.2 # via marshmallow pandas==2.2.2 # via scrapegraphai +pillow==10.3.0 + # via fireworks-ai playwright==1.43.0 # via scrapegraphai # via undetected-playwright -proto-plus==1.24.0 +proto-plus==1.23.0 # via google-ai-generativelanguage # via google-api-core protobuf==4.25.3 @@ -223,15 +210,16 @@ pyasn1==0.6.0 # via rsa pyasn1-modules==0.4.0 # via google-auth -pydantic==2.7.4 +pydantic==2.7.1 # via anthropic + # via fireworks-ai # via google-generativeai # via groq # via langchain # via langchain-core # via langsmith # via openai -pydantic-core==2.18.4 +pydantic-core==2.18.2 # via pydantic pyee==11.1.0 # via playwright @@ -239,13 +227,10 @@ pyparsing==3.1.2 # via httplib2 python-dateutil==2.9.0.post0 # via botocore - # via dateparser - # via htmldate # via pandas python-dotenv==1.0.1 # via scrapegraphai pytz==2024.1 - # via dateparser # via pandas pyyaml==6.0.1 # via huggingface-hub @@ -253,19 +238,19 @@ pyyaml==6.0.1 # via langchain-community # via langchain-core regex==2024.5.15 - # via dateparser # via tiktoken -requests==2.32.3 +requests==2.32.2 # via free-proxy # via google-api-core # via huggingface-hub # via langchain # via langchain-community + # via langchain-fireworks # via langsmith # via tiktoken rsa==4.9 # via google-auth -s3transfer==0.10.2 +s3transfer==0.10.1 # via boto3 semchunk==1.0.1 # via scrapegraphai @@ -279,18 +264,16 @@ sniffio==1.3.1 # via openai soupsieve==2.5 # via beautifulsoup4 -sqlalchemy==2.0.31 +sqlalchemy==2.0.30 # via langchain # via langchain-community -tenacity==8.4.2 +tenacity==8.3.0 # via langchain # via langchain-community # via langchain-core tiktoken==0.6.0 # via langchain-openai # via scrapegraphai -tld==0.13 - # via courlan tokenizers==0.19.1 # via anthropic tqdm==4.66.4 @@ -299,11 +282,8 @@ tqdm==4.66.4 # via openai # via scrapegraphai # via semchunk -trafilatura==1.10.0 - # via scrapegraphai -typing-extensions==4.12.2 +typing-extensions==4.12.0 # via anthropic - # via anyio # via google-generativeai # via groq # via huggingface-hub @@ -317,17 +297,12 @@ typing-inspect==0.9.0 # via dataclasses-json tzdata==2024.1 # via pandas -tzlocal==5.2 - # via dateparser undetected-playwright==0.3.0 # via scrapegraphai uritemplate==4.1.1 # via google-api-python-client -urllib3==1.26.19 +urllib3==1.26.18 # via botocore - # via courlan - # via htmldate # via requests - # via trafilatura yarl==1.9.4 # via aiohttp diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index ccd3158a..c04b6efd 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ 
b/scrapegraphai/graphs/abstract_graph.py @@ -11,6 +11,7 @@ from langchain_community.embeddings import HuggingFaceHubEmbeddings, OllamaEmbeddings from langchain_google_genai import GoogleGenerativeAIEmbeddings from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings +from langchain_fireworks import FireworksEmbeddings from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings from ..helpers import models_tokens @@ -23,7 +24,8 @@ HuggingFace, Ollama, OpenAI, - OneApi + OneApi, + Fireworks ) from ..models.ernie import Ernie from ..utils.logging import set_verbosity_debug, set_verbosity_warning, set_verbosity_info @@ -102,7 +104,7 @@ def __init__(self, prompt: str, config: dict, "embedder_model": self.embedder_model, "cache_path": self.cache_path, } - + self.set_common_params(common_params, overwrite=True) # set burr config @@ -125,7 +127,7 @@ def set_common_params(self, params: dict, overwrite=False): for node in self.graph.nodes: node.update_config(params, overwrite) - + def _create_llm(self, llm_config: dict, chat=False) -> object: """ Create a large language model instance based on the configuration provided. @@ -160,8 +162,15 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: try: self.model_token = models_tokens["oneapi"][llm_params["model"]] except KeyError as exc: - raise KeyError("Model Model not supported") from exc + raise KeyError("Model not supported") from exc return OneApi(llm_params) + elif "fireworks" in llm_params["model"]: + try: + self.model_token = models_tokens["fireworks"][llm_params["model"].split("/")[-1]] + llm_params["model"] = "/".join(llm_params["model"].split("/")[1:]) + except KeyError as exc: + raise KeyError("Model not supported") from exc + return Fireworks(llm_params) elif "azure" in llm_params["model"]: # take the model after the last dash llm_params["model"] = llm_params["model"].split("/")[-1] @@ -172,12 +181,14 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: return AzureOpenAI(llm_params) elif "gemini" in llm_params["model"]: + llm_params["model"] = llm_params["model"].split("/")[-1] try: self.model_token = models_tokens["gemini"][llm_params["model"]] except KeyError as exc: raise KeyError("Model not supported") from exc return Gemini(llm_params) elif llm_params["model"].startswith("claude"): + llm_params["model"] = llm_params["model"].split("/")[-1] try: self.model_token = models_tokens["claude"][llm_params["model"]] except KeyError as exc: @@ -203,6 +214,7 @@ def _create_llm(self, llm_config: dict, chat=False) -> object: return Ollama(llm_params) elif "hugging_face" in llm_params["model"]: + llm_params["model"] = llm_params["model"].split("/")[-1] try: self.model_token = models_tokens["hugging_face"][llm_params["model"]] except KeyError: @@ -277,12 +289,13 @@ def _create_default_embedder(self, llm_config=None) -> object: if isinstance(self.llm_model, OpenAI): return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key, base_url=self.llm_model.openai_api_base) elif isinstance(self.llm_model, DeepSeek): - return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key) - + return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key) elif isinstance(self.llm_model, AzureOpenAIEmbeddings): return self.llm_model elif isinstance(self.llm_model, AzureOpenAI): return AzureOpenAIEmbeddings() + elif isinstance(self.llm_model, Fireworks): + return FireworksEmbeddings(model=self.llm_model.model_name) elif isinstance(self.llm_model, Ollama): # unwrap the kwargs from the model whihc is a dict params = 
self.llm_model._lc_kwargs @@ -333,6 +346,13 @@ def _create_embedder(self, embedder_config: dict) -> object: except KeyError as exc: raise KeyError("Model not supported") from exc return HuggingFaceHubEmbeddings(model=embedder_params["model"]) + elif "fireworks" in embedder_params["model"]: + embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) + try: + models_tokens["fireworks"][embedder_params["model"]] + except KeyError as exc: + raise KeyError("Model not supported") from exc + return FireworksEmbeddings(model=embedder_params["model"]) elif "gemini" in embedder_params["model"]: try: models_tokens["gemini"][embedder_params["model"]] diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py index 39c6682b..543eb91f 100644 --- a/scrapegraphai/helpers/models_tokens.py +++ b/scrapegraphai/helpers/models_tokens.py @@ -148,5 +148,10 @@ "ernie-bot-2-base-en": 4096, "ernie-bot-2-base-en-zh": 4096, "ernie-bot-2-base-zh-en": 4096 - } + }, + "fireworks": { + "llama-v2-7b": 4096, + "mixtral-8x7b-instruct": 4096, + "nomic-ai/nomic-embed-text-v1.5": 8192 + }, } diff --git a/scrapegraphai/models/__init__.py b/scrapegraphai/models/__init__.py index 0a1ad2af..6c90dc0f 100644 --- a/scrapegraphai/models/__init__.py +++ b/scrapegraphai/models/__init__.py @@ -14,3 +14,4 @@ from .anthropic import Anthropic from .deepseek import DeepSeek from .oneapi import OneApi +from .fireworks import Fireworks diff --git a/scrapegraphai/models/fireworks.py b/scrapegraphai/models/fireworks.py new file mode 100644 index 00000000..445c4846 --- /dev/null +++ b/scrapegraphai/models/fireworks.py @@ -0,0 +1,33 @@ +""" +Fireworks Module +""" +from langchain_fireworks import ChatFireworks + + +class Fireworks(ChatFireworks): + """ + Initializes the Fireworks class. + + Args: + llm_config (dict): A dictionary containing configuration parameters for the LLM (required). + The specific keys and values will depend on the LLM implementation + used by the underlying `ChatFireworks` class. Consult its documentation + for details. + + Raises: + ValueError: If required keys are missing from the llm_config dictionary. + """ + + def __init__(self, llm_config: dict): + """ + Initializes the Fireworks class. + + Args: + llm_config (dict): A dictionary containing configuration parameters for the LLM. + The specific keys and values will depend on the LLM implementation. + + Raises: + ValueError: If required keys are missing from the llm_config dictionary. 
+ """ + + super().__init__(**llm_config) diff --git a/tests/graphs/smart_scraper_fireworks_test.py b/tests/graphs/smart_scraper_fireworks_test.py new file mode 100644 index 00000000..9ef58b35 --- /dev/null +++ b/tests/graphs/smart_scraper_fireworks_test.py @@ -0,0 +1,57 @@ +""" +Module for testing the smart scraper class +""" + +import os +import pytest +import pandas as pd +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +@pytest.fixture +def graph_config(): + """Configuration of the graph""" + fireworks_api_key = os.getenv("FIREWORKS_APIKEY") + return { + "llm": { + "api_key": fireworks_api_key, + "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "verbose": True, + "headless": False, + } + +def test_scraping_pipeline(graph_config): + """Start of the scraping pipeline""" + smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their description.", + source="https://perinim.github.io/projects/", + config=graph_config, + ) + + result = smart_scraper_graph.run() + + assert result is not None + assert isinstance(result, dict) + +def test_get_execution_info(graph_config): + """Get the execution info""" + smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their description.", + source="https://perinim.github.io/projects/", + config=graph_config, + ) + + smart_scraper_graph.run() + + graph_exec_info = smart_scraper_graph.get_execution_info() + + assert graph_exec_info is not None