
Feat/link suggest #253

Merged (18 commits) on Sep 22, 2023
8 changes: 8 additions & 0 deletions config.py
@@ -19,6 +19,14 @@
text_key="content",
extra_keys=["url"],
)

dapps_index = dict(
type="index.weaviate.WeaviateIndex",
index_name="Web3Apps",
text_key="description",
extra_keys=["url","name"],
)

api_docs_index = dict(
type="index.weaviate.WeaviateIndex",
index_name="APIDocsV1",
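For context, a minimal sketch (not part of the diff) of how an entry like dapps_index could be resolved at runtime, assuming the app's config loader imports the class named in type and passes the remaining keys as keyword arguments; resolve_index below is a hypothetical helper, not code from this repository:

from importlib import import_module

def resolve_index(config: dict):
    # hypothetical loader: split "index.weaviate.WeaviateIndex" into module and class,
    # then construct the class with the remaining config keys as kwargs
    module_path, class_name = config["type"].rsplit(".", 1)
    cls = getattr(import_module(module_path), class_name)
    kwargs = {k: v for k, v in config.items() if k != "type"}
    return cls(**kwargs)

# e.g. resolve_index(dapps_index) would build a WeaviateIndex over the "Web3Apps" class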
1 change: 1 addition & 0 deletions eval/eval_widgets.txt
@@ -33,3 +33,4 @@ fetch_eth_out
display_uniswap
fetch_nfts_owned_by_address_or_domain
fetch_nfts_owned_by_user
fetch_link_suggestion
83 changes: 83 additions & 0 deletions index/dapps.py
@@ -0,0 +1,83 @@
# to build the index for dapps, first scrape them using the scraper
# then run: python3 -c "from index.dapps import backfill; backfill()"


from langchain.docstore.document import Document
from .weaviate import get_client
import json

INDEX_NAME = "Web3Apps"
INDEX_DESCRIPTION = "Index of Third party dapps"
DAPP_DESCRIPTION = "description"
DAPP_NAME = "name"
DAPP_URL = "url"

def delete_schema() -> None:
try:
client = get_client()
client.schema.delete_class(INDEX_NAME)
except Exception as e:
print(f"Error deleting schmea: {str(e)}")

def create_schema(delete_first: bool = False) -> None:
try:
client = get_client()
if delete_first:
delete_schema()
client.schema.get()
schema = {
"classes": [
{
"class": INDEX_NAME,
"description": INDEX_DESCRIPTION,
"vectorizer": "text2vec-openai",
"moduleConfig": {
"text2vec-openai": {
"model": "ada",
"modelVersion": "002",
"type": "text"
}
},
"properties": [
{"name": DAPP_NAME, "dataType": ["text"]},
{"name": DAPP_DESCRIPTION, "dataType": ["text"]},
{
Review comment (Collaborator):

  • let's just remove the social media links, as they will not be used and will unnecessarily occupy space
  • DAPP_URL does not need to be vectorized, so it can be skipped

"name": DAPP_URL,
"dataType": ["text"],
"description": "The URL of the Dapp",
"moduleConfig": {
"text2vec-openai": {
"skip": True,
"vectorizePropertyName": False
}
}
},
]
}

]
}
client.schema.create(schema)
except Exception as e:
print(f"Error creating schema: {str(e)}")

def backfill():
try:
from langchain.vectorstores import Weaviate

with open('./knowledge_base/dapps_ranked_unique.json') as f:
dapp_list = json.load(f)

documents = [d.pop("description") for d in dapp_list]

metadatas = dapp_list

create_schema(delete_first=True)

client = get_client()
w = Weaviate(client, INDEX_NAME, DAPP_DESCRIPTION)  # text_key matches config.py's dapps_index
w.add_texts(documents, metadatas)
except Exception as e:
print(f"Error during backfill in dapps.py {str(e)}")


25 changes: 22 additions & 3 deletions knowledge_base/widgets.yaml
@@ -443,8 +443,27 @@
type: object
return_value_description: an answer to the question, with suggested followup questions
if available
- _name_: fetch_scraped_sites
description: Answer questions using general content scraped from web3 sites. It
#
# deprecating scraped sites in favor of link suggestion - leaving for future reference
# in case we ever want to revive
#
# - _name_: fetch_scraped_sites
# description: Answer questions using general content scraped from web3 sites. It
# does not know about this app or about widget magic commands for invoking transactions
# or fetching data about specific things like NFTs or balances.
# parameters:
# properties:
# query:
# description: a standalone question representing information to be retrieved
# from the index.
# type: string
# required:
# - query
# type: object
# return_value_description: a summarized answer with source citations
- _name_: fetch_link_suggestion
description:
Answer questions using general content scraped from web3 sites. It
does not know about this app or about widget magic commands for invoking transactions
or fetching data about specific things like NFTs or balances.
parameters:
@@ -456,7 +475,7 @@
required:
- query
type: object
return_value_description: a summarized answer with source citations
return_value_description: a summarized answer with source citations and links
- _name_: display_zksync_deposit
description: Used to bridge and deposit tokens from mainnet L1 to zksync L2.
parameters:
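For illustration only, a hypothetical handler matching the fetch_link_suggestion contract above (a standalone query in, a summarized answer with source links out). The index class name "Web3Sites" and the naive summary are assumptions; this is not the implementation merged in this PR:

from langchain.vectorstores import Weaviate
from index.weaviate import get_client

def fetch_link_suggestion(query: str, k: int = 4) -> str:
    client = get_client()
    # assumed class name; "content" and "url" keys mirror the sites index in config.py
    store = Weaviate(client, "Web3Sites", "content", attributes=["url"])
    docs = store.similarity_search(query, k=k)
    # naive stand-in for an LLM-generated summary
    summary = " ".join(d.page_content[:200] for d in docs)
    links = "\n".join(d.metadata.get("url", "") for d in docs)
    return f"{summary}\n\nSources:\n{links}"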
226 changes: 226 additions & 0 deletions scrape/dapp_scraper.py
@@ -0,0 +1,226 @@
import os
import argparse

# Change the current working directory to the parent directory of the script
os.chdir(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import requests
import json
from typing import List

from scrape.models import (
db_session,
Dapp
)

BROWSERLESS_API_KEY = os.getenv('BROWSERLESS_API_KEY', '')
SCRAPE_API_URL = f'https://chrome.browserless.io/scrape?token={BROWSERLESS_API_KEY}'

# Construct the path to the dapps_ranked.json file
dapps_json_path = os.path.join(os.getcwd(), "scrape", "dapps_ranked_unique.json")

# scrape a URL for IPFS links and return them as a list
def get_ipfs_links_from_url(url: str) -> List[str]:

# specify what elements to return - in this case IPFS links
payload = json.dumps({
"url": url,
"elements": [
{
"selector": "a[href*='ipfs.io']",
},
],
})

# make the request
r = requests.post(SCRAPE_API_URL, headers={
'Cache-Control': 'no-cache',
'Content-Type': 'application/json',
}, data=payload)

# response text
response = r.text

# Parse the JSON string into a dictionary
data = json.loads(response)

# Access the items in the 'results' key
results = data['data'][0]['results']

# instantiate array to hold cleaned URLs
cleaned_ipfs_urls = []

# loop through response data, build array of just the IPFS links
for result in results:
href_value = None
for attribute in result["attributes"]:
if attribute["name"] == "href":
href_value = attribute["value"]
break

if href_value:
cleaned_ipfs_urls.append(href_value)

# return links arr
return cleaned_ipfs_urls


def scrape_ipfs_links(url: str) -> None:

payload = json.dumps({
"url": url,
"elements": [
{
"selector": "a[href*='ipfs.io']",
},
],
})

r = requests.post(SCRAPE_API_URL, headers={
'Cache-Control': 'no-cache',
'Content-Type': 'application/json',
}, data=payload)

# Assuming the value from r.text is stored in the 'response' variable
response = r.text

# Parse the JSON string into a dictionary
data = json.loads(response)

# Access the items in the 'results' key from the browserless response
results = data['data'][0]['results']
cleaned_ipfs_urls = []

for result in results:
href_value = None
for attribute in result["attributes"]:
if attribute["name"] == "href":
href_value = attribute["value"]
break

if href_value:
cleaned_ipfs_urls.append(href_value)

headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Language': 'en-US,en;q=0.9',
'Cache-Control': 'no-cache',
'Pragma': 'no-cache',
'Sec-Ch-Ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': '"macOS"',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'cross-site',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}

responses = []

# here we take the scraped CIDs and pull info for each dapp
# from cloudflare's public IPFS gateway
with requests.Session() as session:
for url in cleaned_ipfs_urls:
CID = get_url_suffix(url)
IPFS_URL = f"https://cloudflare-ipfs.com/ipfs/{CID}"
try:
response = session.get(IPFS_URL, headers=headers, timeout=30)
if response.status_code == 200:
# process the response
responses.append(response.content)
else:
print(f"Failed to retrieve {url}. Status code: {response.status_code}")
except requests.RequestException as e:
print(f"Error fetching {url}: {e}")

# convert bytes objects to strings and load them as JSON
responses_str = [json.loads(response.decode()) for response in responses]

# Save the responses array to a new json file called 'dapp-list.json'
with open('dapp-list.json', 'w') as f:
json.dump(clean_payload_data(responses_str), f, ensure_ascii=False)



# util function that returns the last path segment of a URL - in this case, the IPFS CID
def get_url_suffix(url: str) -> str:
return url.rsplit('/', 1)[-1]

# Function to further clean the original JSON data by focusing only on the 'payload' property of 'msg'
def clean_payload_data(original_data):
# Extract and parse the 'msg' fields, then extract the 'payload' property
cleaned_payload_data = [json.loads(item.get('msg', '{}')).get('payload', {}) for item in original_data]

# reduce each obj to just a few properties that we need
reduced_data = []
for dapp in cleaned_payload_data:
cleaned_dapp = {
"name": dapp["name"],
"description": dapp["description"],
"url": dapp["url"],
"twitterHandle": dapp["twitterHandle"],
"blogLinks": dapp["blogLinks"],
"discord": dapp["socialLinks"]["discord"],
"facebook": dapp["socialLinks"]["facebook"],
"instagram": dapp["socialLinks"]["instagram"],
"telegram": dapp["socialLinks"]["telegram"]
}
reduced_data.append(cleaned_dapp)

return reduced_data


def load_data_from_json_to_db(session=db_session, json_path=dapps_json_path):
print("Loading data from JSON to DB")
# 1. Setup
# If the table doesn't exist, create it
# Base.metadata.create_all(session.bind) Don't need this - jacob b

# 2. Data Loading

# Read the JSON data
with open(json_path, "r") as file:
dapps_data = json.load(file)

# Loop through the JSON data and insert each entry into the database
for dapp in dapps_data:
print(f'adding {dapp["name"]}')
dapp_instance = Dapp(
description=dapp["description"],
name=dapp["name"],
url=dapp["url"],
twitter_handle=dapp["twitterHandle"],
blog_links=dapp["blogLinks"],
discord=dapp["discord"],
facebook=dapp["facebook"],
instagram=dapp["instagram"],
telegram=dapp["telegram"]
)
session.add(dapp_instance)

# 3. Finalization

# Commit the transactions
session.commit()

print("Finished loading data from JSON to DB")



if __name__ == "__main__":

# create an ArgumentParser instance
parser = argparse.ArgumentParser(description='Run functions in the dapp_scraper module.')

# Add an argument for the load_data_from_json_to_db function
parser.add_argument('--load', action='store_true', help='Run the load_data_from_json_to_db function.')

# Parse the command line arguments
args = parser.parse_args()

# If the --load argument is present, call the load_data_from_json_to_db function
if args.load:
load_data_from_json_to_db()
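Usage note (not part of the diff): mirroring the backfill comment in index/dapps.py, the loader can be run from the repository root with python3 -m scrape.dapp_scraper --load, or directly via python3 -c "from scrape.dapp_scraper import load_data_from_json_to_db; load_data_from_json_to_db()".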
18 changes: 17 additions & 1 deletion scrape/models.py
@@ -12,7 +12,7 @@
from sqlalchemy.orm import ( # type: ignore
scoped_session, sessionmaker, relationship,
backref)
from sqlalchemy.dialects.postgresql import UUID, JSONB
from sqlalchemy.dialects.postgresql import UUID, JSONB, ARRAY, TEXT
from sqlalchemy.ext.declarative import declarative_base # type: ignore
from sqlalchemy_utils import ChoiceType, Timestamp # type: ignore

@@ -35,3 +35,19 @@ class ScrapedUrl(Base, Timestamp): # type: ignore
data = Column(JSONB, nullable=False)

Index('scraped_url_lookup', url, unique=True)

class Dapp(Base, Timestamp):
__tablename__ = 'dapp'

id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
description = Column(TEXT, nullable=False)
name = Column(String(255), nullable=False, unique=True)
url = Column(String(255), nullable=False)
twitter_handle = Column(String(255), nullable=True)
blog_links = Column(ARRAY(String(255)), nullable=True)
discord = Column(String(255), nullable=True)
facebook = Column(String(255), nullable=True)
instagram = Column(String(255), nullable=True)
telegram = Column(String(255), nullable=True)

Index('dapp_by_name', 'name', unique=True)
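
A minimal sketch (not part of the diff) of how the new Dapp model might be used with the existing db_session, assuming the dapp table already exists in the target database; field values are made up for illustration:

from scrape.models import db_session, Dapp

# insert one row
dapp = Dapp(
    name="ExampleSwap",
    description="A demo decentralized exchange.",
    url="https://example.org",
)
db_session.add(dapp)
db_session.commit()

# look it up again by its unique name
found = db_session.query(Dapp).filter_by(name="ExampleSwap").one()
print(found.url)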