Feat/link suggest #253

Merged 18 commits on Sep 22, 2023
Changes from 4 commits
8 changes: 8 additions & 0 deletions config.py
@@ -19,6 +19,14 @@
text_key="content",
extra_keys=["url"],
)

dapps_index = dict(
type="index.weaviate.WeaviateIndex",
index_name="ThirdPartyDapps",
text_key="description",
extra_keys=["url","name"],
)

api_docs_index = dict(
type="index.weaviate.WeaviateIndex",
index_name="APIDocsV1",
106 changes: 106 additions & 0 deletions index/dapps.py
@@ -0,0 +1,106 @@
# to build the index for dapps, first scrape them using the scraper
# then run: python3 -c "from index.dapps import backfill; backfill()"


from langchain.docstore.document import Document
from .weaviate import get_client
import json

INDEX_NAME = "ThirdPartyDapps"
INDEX_DESCRIPTION = "Index of Third party dapps"
DAPP_DESCRIPTION = "description"
DAPP_NAME = "name"
DAPP_URL = "url"

def delete_schema() -> None:
try:
client = get_client()
client.schema.delete_class(INDEX_NAME)
except Exception as e:
        print(f"Error deleting schema: {str(e)}")

def create_schema(delete_first: bool = False) -> None:
try:
client = get_client()
if delete_first:
delete_schema()
client.schema.get()
schema = {
"classes": [
{
"class": INDEX_NAME,
"description": INDEX_DESCRIPTION,
"vectorizer": "text2vec-openai",
"moduleConfig": {
"text2vec-openai": {
"model": "ada",
"modelVersion": "002",
"type": "text"
}
},
"properties": [
{"name": DAPP_NAME, "dataType": ["text"]},
{"name": DAPP_DESCRIPTION, "dataType": ["text"]},
{"name": DAPP_URL, "dataType": ["text"]},
{
Reviewer (Collaborator):
  • Let's just remove the social media links; they will not be used and will unnecessarily occupy space.
  • DAPP_URL does not need to be vectorized, so it can be skipped (see the sketch below).
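
A minimal sketch of how the trimmed property list could look if the social-link fields are dropped and the URL is kept as metadata but excluded from vectorization; this assumes Weaviate's per-property moduleConfig "skip" option and is not part of this PR:

properties_sketch = [
    {"name": DAPP_NAME, "dataType": ["text"]},
    {"name": DAPP_DESCRIPTION, "dataType": ["text"]},
    {
        "name": DAPP_URL,
        "dataType": ["text"],
        # assumption: keep the URL as stored metadata but skip it during vectorization
        "moduleConfig": {"text2vec-openai": {"skip": True}},
    },
]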

"name": "twitterHandle",
"dataType": ["text"],
"description": "The Twitter handle of the Dapp"
},
{
"name": "blogLinks",
"dataType": ["text[]"],
"description": "Links to the blog posts related to the Dapp"
},
{
"name": "discord",
"dataType": ["text"],
"description": "The Discord server link of the Dapp"
},
{
"name": "facebook",
"dataType": ["text"],
"description": "The Facebook page link of the Dapp"
},
{
"name": "instagram",
"dataType": ["text"],
"description": "The Instagram profile link of the Dapp"
},
{
"name": "telegram",
"dataType": ["text"],
"description": "The Telegram channel link of the Dapp"
}
]
}

]
}
client.schema.create(schema)
except Exception as e:
print(f"Error creating schema: {str(e)}")

def backfill():
try:
from langchain.vectorstores import Weaviate

with open('./knowledge_base/dapps_ranked.json') as f:
dapp_list = json.load(f)

# Extract the 'name' field from each dapp and store it in the 'documents' list
documents = [d.pop("name") for d in dapp_list]

# Use the remaining fields in each dapp to populate the 'metadatas' list
# is this the best 'metadatas' to use?
metadatas = dapp_list

create_schema(delete_first=True)

client = get_client()
w = Weaviate(client, INDEX_NAME, DAPP_NAME) # is this a proper 3rd argument?
w.add_texts(documents, metadatas)
except Exception as e:
        print(f"Error during backfill in dapps.py: {str(e)}")
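
For context, backfill expects each entry in knowledge_base/dapps_ranked.json to look roughly like the objects written by clean_payload_data in scrape/dapp_scraper.py below; the values here are illustrative only:

example_entry = {
    "name": "ExampleSwap",                        # illustrative
    "description": "A decentralized exchange.",   # illustrative
    "url": "https://example.org",
    "twitterHandle": "@exampleswap",
    "blogLinks": ["https://blog.example.org/launch"],
    "discord": "https://discord.gg/example",
    "facebook": "",
    "instagram": "",
    "telegram": "",
}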


193 changes: 193 additions & 0 deletions scrape/dapp_scraper.py
@@ -0,0 +1,193 @@
import requests
import json
import os
from typing import List

BROWSERLESS_API_KEY = os.getenv('BROWSERLESS_API_KEY', '')
SCRAPE_API_URL = f'https://chrome.browserless.io/scrape?token={BROWSERLESS_API_KEY}'

# scrape a URL for IPFS links and return them as a list
def get_ipfs_links_from_url(url: str) -> List[str]:

    # specify what elements to return - in this case IPFS links
payload = json.dumps({
"url": url,
"elements": [
{
"selector": "a[href*='ipfs.io']",
},
],
})

# make the request
r = requests.post(SCRAPE_API_URL, headers={
'Cache-Control': 'no-cache',
'Content-Type': 'application/json',
}, data=payload)

# response text
response = r.text

# Parse the JSON string into a dictionary
data = json.loads(response)

# Access the items in the 'results' key
results = data['data'][0]['results']

# instantiate array to hold cleaned URLs
cleaned_ipfs_urls = []

# loop through response data, build array of just the IPFS links
for result in results:
href_value = None
for attribute in result["attributes"]:
if attribute["name"] == "href":
href_value = attribute["value"]
break

if href_value:
cleaned_ipfs_urls.append(href_value)

# return links arr
return cleaned_ipfs_urls
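
For clarity, the parsing above assumes a /scrape response shaped roughly like the following (trimmed and illustrative; consult the browserless documentation for the exact schema):

example_scrape_response = {
    "data": [
        {
            "results": [
                {
                    "attributes": [
                        {"name": "href", "value": "https://ipfs.io/ipfs/<CID>"},
                    ],
                },
            ],
        },
    ],
}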


def scrape_ipfs_links(url: str) -> str:

payload = json.dumps({
"url": url,
"elements": [
{
"selector": "a[href*='ipfs.io']",
},
],
})

r = requests.post(SCRAPE_API_URL, headers={
'Cache-Control': 'no-cache',
'Content-Type': 'application/json',
}, data=payload)

    # raw response text
response = r.text

# Parse the JSON string into a dictionary
data = json.loads(response)

# Access the items in the 'results' key from the browserless response
results = data['data'][0]['results']
cleaned_ipfs_urls = []

for result in results:
href_value = None
for attribute in result["attributes"]:
if attribute["name"] == "href":
href_value = attribute["value"]
break

if href_value:
cleaned_ipfs_urls.append(href_value)

headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Language': 'en-US,en;q=0.9',
'Cache-Control': 'no-cache',
'Pragma': 'no-cache',
'Sec-Ch-Ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': '"macOS"',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'cross-site',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}

responses = []

# here we take the scraped CIDs and pull info for each dapp
# from cloudflare's public IPFS gateway
with requests.Session() as session:
for url in cleaned_ipfs_urls:
CID = get_url_suffix(url)
IPFS_URL = f"https://cloudflare-ipfs.com/ipfs/{CID}"
try:
response = session.get(IPFS_URL, headers=headers, timeout=30)
if response.status_code == 200:
# process the response
responses.append(response.content)
else:
print(f"Failed to retrieve {url}. Status code: {response.status_code}")
except requests.RequestException as e:
print(f"Error fetching {url}: {e}")

# convert bytes objects to strings and load them as JSON
responses_str = [json.loads(response.decode()) for response in responses]

# Save the responses array to a new json file called 'dapp-list.json'
with open('dapp-list.json', 'w') as f:
json.dump(clean_payload_data(responses_str), f, ensure_ascii=False)



# a util function that, in this case, will get us the IPFS CID
def get_url_suffix(url: str) -> str:
return url.rsplit('/', 1)[-1]

# Function to further clean the original JSON data by focusing only on the 'payload' property of 'msg'
def clean_payload_data(original_data):
# Extract and parse the 'msg' fields, then extract the 'payload' property
cleaned_payload_data = [json.loads(item.get('msg', '{}')).get('payload', {}) for item in original_data]

# reduce each obj to just a few properties that we need
reduced_data = []
for dapp in cleaned_payload_data:
cleaned_dapp = {
"name": dapp["name"],
"description": dapp["description"],
"url": dapp["url"],
"twitterHandle": dapp["twitterHandle"],
"blogLinks": dapp["blogLinks"],
"discord": dapp["socialLinks"]["discord"],
"facebook": dapp["socialLinks"]["facebook"],
"instagram": dapp["socialLinks"]["instagram"],
"telegram": dapp["socialLinks"]["telegram"]
}
reduced_data.append(cleaned_dapp)

return reduced_data


def load_data_from_json_to_db(session, json_path):
# 1. Setup
# If the table doesn't exist, create it
    # Base.metadata.create_all(session.bind)  # Don't need this - jacob b

# 2. Data Loading

# Read the JSON data
with open(json_path, "r") as file:
dapps_data = json.load(file)

# Loop through the JSON data and insert each entry into the database
for dapp in dapps_data:
        dapp_instance = Dapp(  # model class from scrape/models.py (import assumed, not shown in this diff)
description=dapp["description"],
name=dapp["name"],
url=dapp["url"],
twitter_handle=dapp["twitterHandle"],
blog_links=dapp["blogLinks"],
discord=dapp["discord"],
facebook=dapp["facebook"],
instagram=dapp["instagram"],
telegram=dapp["telegram"]
)
session.add(dapp_instance)

# 3. Finalization

# Commit the transactions
session.commit()

18 changes: 17 additions & 1 deletion scrape/models.py
@@ -12,7 +12,7 @@
from sqlalchemy.orm import ( # type: ignore
scoped_session, sessionmaker, relationship,
backref)
from sqlalchemy.dialects.postgresql import UUID, JSONB
from sqlalchemy.dialects.postgresql import UUID, JSONB, ARRAY, TEXT
from sqlalchemy.ext.declarative import declarative_base # type: ignore
from sqlalchemy_utils import ChoiceType, Timestamp # type: ignore

@@ -35,3 +35,19 @@ class ScrapedUrl(Base, Timestamp): # type: ignore
data = Column(JSONB, nullable=False)

Index('scraped_url_lookup', url, unique=True)

class Dapp(Base, Timestamp):
__tablename__ = 'dapp'

id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
description = Column(TEXT, nullable=False)
name = Column(String(255), nullable=False, unique=True)
url = Column(String(255), nullable=False)
twitter_handle = Column(String(255), nullable=True)
blog_links = Column(ARRAY(String(255)), nullable=True)
discord = Column(String(255), nullable=True)
facebook = Column(String(255), nullable=True)
instagram = Column(String(255), nullable=True)
telegram = Column(String(255), nullable=True)

    Index('dapp_by_name', name, unique=True)
2 changes: 1 addition & 1 deletion tools/index_answer.py
@@ -12,7 +12,7 @@

TEMPLATE = '''You are a web3 assistant. You help users with answering web3-related questions. Your responses should sound natural, helpful, cheerful, and engaging, and you should use easy to understand language with explanations for jargon.

Information to help complete your task is below. Only use information below to answer the question, and create a final answer with inline citations linked to the provided source URLs. If you don't know the answer, just say that you don't know. Don't try to make up an answer. ALWAYS return a "SOURCES" part in your answer corresponding to the numbered inline citations.
Information to help complete your task is below. Only use information below to answer the question, and create a final answer with inline citations linked to the provided source URLs. If you don't know the answer, just say that you don't know. Don't try to make up an answer. ALWAYS return a "SOURCES" part in your answer corresponding to the numbered inline citations. ALWAYS provide a link to each citation in SOURCES.
Reviewer (Collaborator):
  • Similar to having a separate function handler for link suggestion distinct from scraped sites, it may be better to also have a separate tools module, e.g. index_dapp_suggestion or similar.

  • Trying to figure out how consistently the LLM is able to cite SOURCES in a reliable format; any insights on that?
    We could also explore a more structured format for the results, such as a list of JSON objects with fields like dapp_name, dapp_desc, dapp_url, and then post-process it or send it to the FE as JSON to render consistently (a rough sketch follows).
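
A rough sketch of the structured result format suggested above; field names follow the comment and the values are illustrative, not part of this PR:

structured_results = [
    {
        "dapp_name": "ExampleSwap",               # illustrative
        "dapp_desc": "A decentralized exchange.",
        "dapp_url": "https://example.org",
        "source": "https://example.org/docs",     # citation link for the SOURCES section
    },
]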

---
{task_info}
---
7 changes: 5 additions & 2 deletions tools/index_widget.py
@@ -355,13 +355,16 @@ def fn(token_handler):
@error_wrap
def fetch_scraped_sites(query: str) -> Callable:
def fn(token_handler):
scraped_sites_index = config.initialize(config.scraped_sites_index)
# below is the old index used previously for scraped sites - do we still use this?
Reviewer (Collaborator):
I think there is some overlap between the link suggestion and scraped sites modules, as both seek to answer generic questions about the ecosystem.
If we use both of them, I suspect the model may not be able to reliably discern which one to use.

Can you test this by having both enabled and seeing how the system performs? To test both, you will need to have a separate widget command for each in the yaml.

You can also remove your comments.

# if so, should we make a different function for the new dapps_index?
# scraped_sites_index = config.initialize(config.scraped_sites_index)
dapps_index = config.initialize(config.dapps_index)
tool = dict(
type="tools.index_answer.IndexAnswerTool",
_streaming=True,
name="ScrapedSitesIndexAnswer",
content_description="", # not used
index=scraped_sites_index,
index=dapps_index,
top_k=3,
source_key="url",
)
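
For reference, a rough sketch of what a separate handler dedicated to the dapps index could look like, mirroring fetch_scraped_sites; the function and tool names are hypothetical and not part of this PR:

@error_wrap
def fetch_dapp_suggestions(query: str) -> Callable:  # hypothetical name
    def fn(token_handler):
        dapps_index = config.initialize(config.dapps_index)
        tool = dict(
            type="tools.index_answer.IndexAnswerTool",
            _streaming=True,
            name="DappsIndexAnswer",   # hypothetical tool name
            content_description="",    # not used
            index=dapps_index,
            top_k=3,
            source_key="url",
        )
        # ... the remainder would mirror fetch_scraped_sites (not shown in this diff)
    return fn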