Feat/link suggest #253 (Merged)
Changes from all commits (18 commits):
- 5678a6c inital link suggestion work (jacobryas4)
- e3d055f updates (jacobryas4)
- 5fe112e link suggestions functionality (jacobryas4)
- 3bc7699 updating Dapp class (jacobryas4)
- d616276 skip vectorizing certain fields (jacobryas4)
- 929b661 updated index name (jacobryas4)
- e1f2c2d updating db load function (jacobryas4)
- 34bbbfa adding changes to keep scraped sites and add link suggestion (jacobryas4)
- dce09b7 commenting out scraped sites, new tool for link suggestion with impro… (jacobryas4)
- 42ccfcb skip vectorization fix (jacobryas4)
- 6554609 revert previous changes to index_answer (jacobryas4)
- 9f04c10 revert formatting changes in widget.yaml (jacobryas4)
- beb976a revert formatting changes in widget.yaml (jacobryas4)
- 1bf1829 removing unneeded fields, skip url vectorize (jacobryas4)
- a054f4d upgrading link suggestion prompt (jacobryas4)
- fe0fb6a removing comment (jacobryas4)
- ffb6c33 remove duplicated description and version bump (jacobryas4)
- 0cf4f32 Merge branch 'dev' into feat/link-suggest (jacobryas4)
New file (+83 lines), index/dapps.py (module name taken from its own run instructions):
# to build the index for dapps, first scrape them using the scraper
# then run: python3 -c "from index.dapps import backfill; backfill()"

from langchain.docstore.document import Document
from .weaviate import get_client
import json

INDEX_NAME = "Web3Apps"
INDEX_DESCRIPTION = "Index of Third party dapps"
DAPP_DESCRIPTION = "description"
DAPP_NAME = "name"
DAPP_URL = "url"

def delete_schema() -> None:
    # drop the existing Web3Apps class from Weaviate
    try:
        client = get_client()
        client.schema.delete_class(INDEX_NAME)
    except Exception as e:
        print(f"Error deleting schema: {str(e)}")

def create_schema(delete_first: bool = False) -> None:
    # create the Web3Apps class; the url property is excluded from vectorization
    try:
        client = get_client()
        if delete_first:
            delete_schema()
        client.schema.get()
        schema = {
            "classes": [
                {
                    "class": INDEX_NAME,
                    "description": INDEX_DESCRIPTION,
                    "vectorizer": "text2vec-openai",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "model": "ada",
                            "modelVersion": "002",
                            "type": "text"
                        }
                    },
                    "properties": [
                        {"name": DAPP_NAME, "dataType": ["text"]},
                        {"name": DAPP_DESCRIPTION, "dataType": ["text"]},
                        {
                            "name": DAPP_URL,
                            "dataType": ["text"],
                            "description": "The URL of the Dapp",
                            "moduleConfig": {
                                "text2vec-openai": {
                                    "skip": True,
                                    "vectorizePropertyName": False
                                }
                            }
                        },
                    ]
                }
            ]
        }
        client.schema.create(schema)
    except Exception as e:
        print(f"Error creating schema: {str(e)}")

def backfill():
    # rebuild the schema and load the scraped dapp list into Weaviate
    try:
        from langchain.vectorstores import Weaviate

        with open('./knowledge_base/dapps_ranked_unique.json') as f:
            dapp_list = json.load(f)

        documents = [d.pop("description") for d in dapp_list]
        metadatas = dapp_list

        create_schema(delete_first=True)

        client = get_client()
        w = Weaviate(client, INDEX_NAME, DAPP_NAME)
        w.add_texts(documents, metadatas)
    except Exception as e:
        print(f"Error during backfill in dapps.py: {str(e)}")
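For context, a minimal sketch of how the backfilled index might be queried for link suggestions through the same langchain wrapper used in backfill(). The consuming tool is not part of this file, so the import path for get_client, the query text, and the k value below are illustrative assumptions:

# Hypothetical usage sketch (not shown in this diff): query the backfilled
# Web3Apps class for dapps related to a user request.
from langchain.vectorstores import Weaviate
from index.dapps import INDEX_NAME, DAPP_NAME
from index.weaviate import get_client  # assumed location of get_client

def suggest_dapps(query: str, k: int = 3):
    client = get_client()
    # same constructor arguments backfill() uses: class name plus the text key
    w = Weaviate(client, INDEX_NAME, DAPP_NAME)
    # near-text search through the class's text2vec-openai vectorizer;
    # properties other than the text key come back in each Document's metadata
    return w.similarity_search(query, k=k)

# e.g. suggest_dapps("decentralized exchange for swapping tokens")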
New file (+226 lines), the dapp scraper / DB loader script (its argparse help calls it the dapp_scraper module; the exact path is not shown here):
import os
import argparse

# Change the current working directory to the parent directory of the script
os.chdir(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import requests
import json
from typing import List

from scrape.models import (
    db_session,
    Dapp
)

BROWSERLESS_API_KEY = os.getenv('BROWSERLESS_API_KEY', '')
SCRAPE_API_URL = f'https://chrome.browserless.io/scrape?token={BROWSERLESS_API_KEY}'

# Construct the path to the dapps_ranked_unique.json file
dapps_json_path = os.path.join(os.getcwd(), "scrape", "dapps_ranked_unique.json")

# scrape a URL for IPFS links, return a list of the cleaned hrefs
def get_ipfs_links_from_url(url: str) -> List[str]:

    # specify what elements to return - in this case IPFS links
    payload = json.dumps({
        "url": url,
        "elements": [
            {
                "selector": "a[href*='ipfs.io']",
            },
        ],
    })

    # make the request
    r = requests.post(SCRAPE_API_URL, headers={
        'Cache-Control': 'no-cache',
        'Content-Type': 'application/json',
    }, data=payload)

    # response text
    response = r.text

    # Parse the JSON string into a dictionary
    data = json.loads(response)

    # Access the items in the 'results' key
    results = data['data'][0]['results']

    # instantiate array to hold cleaned URLs
    cleaned_ipfs_urls = []

    # loop through response data, build array of just the IPFS links
    for result in results:
        href_value = None
        for attribute in result["attributes"]:
            if attribute["name"] == "href":
                href_value = attribute["value"]
                break

        if href_value:
            cleaned_ipfs_urls.append(href_value)

    # return links arr
    return cleaned_ipfs_urls


def scrape_ipfs_links(url: str) -> None:

    payload = json.dumps({
        "url": url,
        "elements": [
            {
                "selector": "a[href*='ipfs.io']",
            },
        ],
    })

    r = requests.post(SCRAPE_API_URL, headers={
        'Cache-Control': 'no-cache',
        'Content-Type': 'application/json',
    }, data=payload)

    # Assuming the value from r.text is stored in the 'response' variable
    response = r.text

    # Parse the JSON string into a dictionary
    data = json.loads(response)

    # Access the items in the 'results' key from the browserless response
    results = data['data'][0]['results']
    cleaned_ipfs_urls = []

    for result in results:
        href_value = None
        for attribute in result["attributes"]:
            if attribute["name"] == "href":
                href_value = attribute["value"]
                break

        if href_value:
            cleaned_ipfs_urls.append(href_value)

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'en-US,en;q=0.9',
        'Cache-Control': 'no-cache',
        'Pragma': 'no-cache',
        'Sec-Ch-Ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"macOS"',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'cross-site',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
    }

    responses = []

    # here we take the scraped CIDs and pull info for each dapp
    # from cloudflare's public IPFS gateway
    with requests.Session() as session:
        for url in cleaned_ipfs_urls:
            CID = get_url_suffix(url)
            IPFS_URL = f"https://cloudflare-ipfs.com/ipfs/{CID}"
            try:
                response = session.get(IPFS_URL, headers=headers, timeout=30)
                if response.status_code == 200:
                    # process the response
                    responses.append(response.content)
                else:
                    print(f"Failed to retrieve {url}. Status code: {response.status_code}")
            except requests.RequestException as e:
                print(f"Error fetching {url}: {e}")

    # convert bytes objects to strings and load them as JSON
    responses_str = [json.loads(response.decode()) for response in responses]

    # Save the responses array to a new json file called 'dapp-list.json'
    with open('dapp-list.json', 'w') as f:
        json.dump(clean_payload_data(responses_str), f, ensure_ascii=False)


# a util function that, in this case, will get us the IPFS CID
def get_url_suffix(url: str) -> str:
    return url.rsplit('/', 1)[-1]

# Function to further clean the original JSON data by focusing only on the 'payload' property of 'msg'
def clean_payload_data(original_data):
    # Extract and parse the 'msg' fields, then extract the 'payload' property
    cleaned_payload_data = [json.loads(item.get('msg', '{}')).get('payload', {}) for item in original_data]

    # reduce each obj to just a few properties that we need
    reduced_data = []
    for dapp in cleaned_payload_data:
        cleaned_dapp = {
            "name": dapp["name"],
            "description": dapp["description"],
            "url": dapp["url"],
            "twitterHandle": dapp["twitterHandle"],
            "blogLinks": dapp["blogLinks"],
            "discord": dapp["socialLinks"]["discord"],
            "facebook": dapp["socialLinks"]["facebook"],
            "instagram": dapp["socialLinks"]["instagram"],
            "telegram": dapp["socialLinks"]["telegram"]
        }
        reduced_data.append(cleaned_dapp)

    return reduced_data


def load_data_from_json_to_db(session=db_session, json_path=dapps_json_path):
    print("Loading data from JSON to DB")
    # 1. Setup
    # If the table doesn't exist, create it
    # Base.metadata.create_all(session.bind)  # Don't need this - jacob b

    # 2. Data Loading

    # Read the JSON data
    with open(json_path, "r") as file:
        dapps_data = json.load(file)

    # Loop through the JSON data and insert each entry into the database
    for dapp in dapps_data:
        print(f'adding {dapp["name"]}')
        dapp_instance = Dapp(
            description=dapp["description"],
            name=dapp["name"],
            url=dapp["url"],
            twitter_handle=dapp["twitterHandle"],
            blog_links=dapp["blogLinks"],
            discord=dapp["discord"],
            facebook=dapp["facebook"],
            instagram=dapp["instagram"],
            telegram=dapp["telegram"]
        )
        session.add(dapp_instance)

    # 3. Finalization

    # Commit the transactions
    session.commit()

    print("Finished loading data from JSON to DB")


if __name__ == "__main__":

    # create an ArgumentParser instance
    parser = argparse.ArgumentParser(description='Run functions in the dapp_scraper module.')

    # Add an argument for the load_data_from_json_to_db function
    parser.add_argument('--load', action='store_true', help='Run the load_data_from_json_to_db function.')

    # Parse the command line arguments
    args = parser.parse_args()

    # If the --load argument is present, call the load_data_from_json_to_db function
    if args.load:
        load_data_from_json_to_db()
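A hedged usage sketch for this script follows. The module path scrape.dapp_scraper and the example URL are assumptions (the argparse description names the module, but the file's location is not visible in this diff):

# Hypothetical usage (module path assumed from the argparse description)
from scrape.dapp_scraper import scrape_ipfs_links, load_data_from_json_to_db

# 1) scrape IPFS-hosted dapp metadata into dapp-list.json
scrape_ipfs_links("https://example-dapp-directory.org")  # placeholder URL

# 2) load the curated JSON (scrape/dapps_ranked_unique.json by default) into the DB
load_data_from_json_to_db()

# the same loader is exposed on the command line:
#   python <path-to-this-script> --load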