Merge pull request #253 from yieldprotocol/feat/link-suggest: Feat/link suggest
Showing 10 changed files with 468 additions and 13 deletions.
The first of the new files (83 added lines) is the dapp index module, imported as `index.dapps`:

```python
# To build the index for dapps, first scrape them using the scraper,
# then run: python3 -c "from index.dapps import backfill; backfill()"

import json

from langchain.docstore.document import Document
from .weaviate import get_client

INDEX_NAME = "Web3Apps"
INDEX_DESCRIPTION = "Index of Third party dapps"
DAPP_DESCRIPTION = "description"
DAPP_NAME = "name"
DAPP_URL = "url"


def delete_schema() -> None:
    try:
        client = get_client()
        client.schema.delete_class(INDEX_NAME)
    except Exception as e:
        print(f"Error deleting schema: {str(e)}")


def create_schema(delete_first: bool = False) -> None:
    try:
        client = get_client()
        if delete_first:
            delete_schema()
        # fetch the current schema (result unused; serves as a connectivity check)
        client.schema.get()
        schema = {
            "classes": [
                {
                    "class": INDEX_NAME,
                    "description": INDEX_DESCRIPTION,
                    "vectorizer": "text2vec-openai",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "model": "ada",
                            "modelVersion": "002",
                            "type": "text",
                        }
                    },
                    "properties": [
                        {"name": DAPP_NAME, "dataType": ["text"]},
                        {"name": DAPP_DESCRIPTION, "dataType": ["text"]},
                        {
                            # store the URL, but skip it when vectorizing
                            "name": DAPP_URL,
                            "dataType": ["text"],
                            "description": "The URL of the Dapp",
                            "moduleConfig": {
                                "text2vec-openai": {
                                    "skip": True,
                                    "vectorizePropertyName": False,
                                }
                            },
                        },
                    ],
                }
            ]
        }
        client.schema.create(schema)
    except Exception as e:
        print(f"Error creating schema: {str(e)}")


def backfill():
    try:
        from langchain.vectorstores import Weaviate

        with open("./knowledge_base/dapps_ranked_unique.json") as f:
            dapp_list = json.load(f)

        # use each dapp's description as the document text; the remaining
        # fields (name, url, ...) become per-document metadata
        documents = [d.pop("description") for d in dapp_list]
        metadatas = dapp_list

        create_schema(delete_first=True)

        client = get_client()
        w = Weaviate(client, INDEX_NAME, DAPP_NAME)
        w.add_texts(documents, metadatas)
    except Exception as e:
        print(f"Error during backfill in dapps.py: {str(e)}")
```
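For context, once `backfill()` has run, the index can be queried for link suggestions. A minimal sketch, assuming the weaviate-client v3 query API (consistent with the `client.schema.*` calls above) and a hypothetical query string:

```python
# Sketch: semantic search over the Web3Apps class for link suggestion.
# Assumes get_client() returns a connected weaviate-client v3 Client.
from index.dapps import INDEX_NAME, DAPP_NAME, DAPP_DESCRIPTION, DAPP_URL, get_client

client = get_client()
result = (
    client.query
    .get(INDEX_NAME, [DAPP_NAME, DAPP_DESCRIPTION, DAPP_URL])
    .with_near_text({"concepts": ["swap tokens at the best rate"]})  # hypothetical user intent
    .with_limit(3)
    .do()
)

# v3 responses nest hits under data -> Get -> <class name>
for dapp in result["data"]["Get"][INDEX_NAME]:
    print(dapp["name"], dapp["url"])
```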
The second new file (226 added lines) is the scraper itself; its argparse help refers to it as the `dapp_scraper` module. It begins with setup and a helper that pulls IPFS links out of a page via browserless:

```python
import os
import argparse

# Change the current working directory to the parent directory of the script
os.chdir(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import requests
import json
from typing import List

from scrape.models import (
    db_session,
    Dapp
)

BROWSERLESS_API_KEY = os.getenv('BROWSERLESS_API_KEY', '')
SCRAPE_API_URL = f'https://chrome.browserless.io/scrape?token={BROWSERLESS_API_KEY}'

# Construct the path to the dapps_ranked_unique.json file
dapps_json_path = os.path.join(os.getcwd(), "scrape", "dapps_ranked_unique.json")


# scrape a URL for IPFS links and return the cleaned list of hrefs
def get_ipfs_links_from_url(url: str) -> List[str]:
    # specify which elements to return - in this case, IPFS links
    payload = json.dumps({
        "url": url,
        "elements": [
            {
                "selector": "a[href*='ipfs.io']",
            },
        ],
    })

    # make the request
    r = requests.post(SCRAPE_API_URL, headers={
        'Cache-Control': 'no-cache',
        'Content-Type': 'application/json',
    }, data=payload)

    # parse the JSON response into a dictionary
    data = json.loads(r.text)

    # access the items in the 'results' key
    results = data['data'][0]['results']

    # loop through the response data, keeping just the IPFS links
    cleaned_ipfs_urls = []
    for result in results:
        href_value = None
        for attribute in result["attributes"]:
            if attribute["name"] == "href":
                href_value = attribute["value"]
                break

        if href_value:
            cleaned_ipfs_urls.append(href_value)

    return cleaned_ipfs_urls
```
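The parsing above leans on the shape of browserless's `/scrape` response. A hedged sketch of that shape, with made-up values, showing why the code reads `data['data'][0]['results']` and then scans each result's `attributes` for an `href`:

```python
import json

# Hypothetical /scrape response, trimmed to the fields the parser touches.
sample_response = json.dumps({
    "data": [
        {
            "selector": "a[href*='ipfs.io']",
            "results": [
                {
                    "text": "Example Dapp",  # made-up link text
                    "attributes": [
                        {"name": "class", "value": "registry-link"},
                        {"name": "href", "value": "https://ipfs.io/ipfs/QmExampleCID"},
                    ],
                },
            ],
        },
    ],
})

# The same extraction as get_ipfs_links_from_url, as a comprehension:
data = json.loads(sample_response)
hrefs = [
    attr["value"]
    for result in data["data"][0]["results"]
    for attr in result["attributes"]
    if attr["name"] == "href"
]
print(hrefs)  # ['https://ipfs.io/ipfs/QmExampleCID']
```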
The main scrape routine reuses that extraction, then fetches each linked document from Cloudflare's public IPFS gateway and writes the cleaned results to disk:

```python
def scrape_ipfs_links(url: str) -> None:
    # collect the IPFS links on the page (same extraction as get_ipfs_links_from_url)
    cleaned_ipfs_urls = get_ipfs_links_from_url(url)

    # browser-like headers so the gateway treats us as a normal client
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'en-US,en;q=0.9',
        'Cache-Control': 'no-cache',
        'Pragma': 'no-cache',
        'Sec-Ch-Ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"macOS"',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'cross-site',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
    }

    responses = []

    # take the scraped CIDs and pull info for each dapp
    # from Cloudflare's public IPFS gateway
    with requests.Session() as session:
        for url in cleaned_ipfs_urls:
            CID = get_url_suffix(url)
            IPFS_URL = f"https://cloudflare-ipfs.com/ipfs/{CID}"
            try:
                response = session.get(IPFS_URL, headers=headers, timeout=30)
                if response.status_code == 200:
                    responses.append(response.content)
                else:
                    print(f"Failed to retrieve {url}. Status code: {response.status_code}")
            except requests.RequestException as e:
                print(f"Error fetching {url}: {e}")

    # convert bytes objects to strings and load them as JSON
    responses_str = [json.loads(response.decode()) for response in responses]

    # save the cleaned responses to a new JSON file called 'dapp-list.json'
    with open('dapp-list.json', 'w') as f:
        json.dump(clean_payload_data(responses_str), f, ensure_ascii=False)


# a util function that, in this case, gives us the IPFS CID at the end of a URL
def get_url_suffix(url: str) -> str:
    return url.rsplit('/', 1)[-1]


# Further clean the scraped JSON data by keeping only the 'payload'
# property of each item's 'msg' field
def clean_payload_data(original_data):
    # extract and parse the 'msg' fields, then pull out the 'payload' property
    cleaned_payload_data = [json.loads(item.get('msg', '{}')).get('payload', {}) for item in original_data]

    # reduce each object to just the properties we need
    reduced_data = []
    for dapp in cleaned_payload_data:
        cleaned_dapp = {
            "name": dapp["name"],
            "description": dapp["description"],
            "url": dapp["url"],
            "twitterHandle": dapp["twitterHandle"],
            "blogLinks": dapp["blogLinks"],
            "discord": dapp["socialLinks"]["discord"],
            "facebook": dapp["socialLinks"]["facebook"],
            "instagram": dapp["socialLinks"]["instagram"],
            "telegram": dapp["socialLinks"]["telegram"]
        }
        reduced_data.append(cleaned_dapp)

    return reduced_data
```
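To make the reshaping concrete, here is roughly what one raw gateway response looks like to `clean_payload_data` (assumed in scope from the module above): each item's `msg` is itself a JSON string, and only its `payload` survives. The field values below are invented:

```python
import json

# One raw item as fetched from the gateway (values are made up).
raw_items = [{
    "msg": json.dumps({
        "payload": {
            "name": "ExampleSwap",
            "description": "A hypothetical token swap aggregator.",
            "url": "https://exampleswap.example",
            "twitterHandle": "exampleswap",
            "blogLinks": [],
            "socialLinks": {
                "discord": "https://discord.gg/example",
                "facebook": "",
                "instagram": "",
                "telegram": "",
            },
        }
    })
}]

print(clean_payload_data(raw_items))
# [{'name': 'ExampleSwap', 'description': 'A hypothetical token swap aggregator.',
#   'url': 'https://exampleswap.example', 'twitterHandle': 'exampleswap',
#   'blogLinks': [], 'discord': 'https://discord.gg/example', 'facebook': '',
#   'instagram': '', 'telegram': ''}]
```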
Finally, a loader moves the scraped JSON into the database, behind a small CLI entry point:

```python
def load_data_from_json_to_db(session=db_session, json_path=dapps_json_path):
    print("Loading data from JSON to DB")
    # If the table didn't exist we would create it first:
    # Base.metadata.create_all(session.bind)  # don't need this - jacob b

    # read the JSON data
    with open(json_path, "r") as file:
        dapps_data = json.load(file)

    # loop through the JSON data and insert each entry into the database
    for dapp in dapps_data:
        print(f'adding {dapp["name"]}')
        dapp_instance = Dapp(
            description=dapp["description"],
            name=dapp["name"],
            url=dapp["url"],
            twitter_handle=dapp["twitterHandle"],
            blog_links=dapp["blogLinks"],
            discord=dapp["discord"],
            facebook=dapp["facebook"],
            instagram=dapp["instagram"],
            telegram=dapp["telegram"]
        )
        session.add(dapp_instance)

    # commit the transactions
    session.commit()
    print("Finished loading data from JSON to DB")


if __name__ == "__main__":
    # create an ArgumentParser instance
    parser = argparse.ArgumentParser(description='Run functions in the dapp_scraper module.')

    # add an argument for the load_data_from_json_to_db function
    parser.add_argument('--load', action='store_true', help='Run the load_data_from_json_to_db function.')

    # parse the command line arguments
    args = parser.parse_args()

    # if the --load argument is present, load the JSON data into the DB
    if args.load:
        load_data_from_json_to_db()
```
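Taken together, the intended flow appears to be: scrape dapp metadata into `dapps_ranked_unique.json`, load it into the database with the `--load` flag (e.g. `python3 scrape/dapp_scraper.py --load`, assuming that is the file's path, which the imports and argparse description suggest), and then build the Weaviate index with `python3 -c "from index.dapps import backfill; backfill()"` as noted at the top of the index module.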