Feat/link suggest #253

Merged 18 commits on Sep 22, 2023
Changes from 4 commits
8 changes: 8 additions & 0 deletions config.py
@@ -19,6 +19,14 @@
text_key="content",
extra_keys=["url"],
)

dapps_index = dict(
type="index.weaviate.WeaviateIndex",
index_name="ThirdPartyDapps",
text_key="description",
extra_keys=["url","name"],
)

api_docs_index = dict(
type="index.weaviate.WeaviateIndex",
index_name="APIDocsV1",
106 changes: 106 additions & 0 deletions index/dapps.py
@@ -0,0 +1,106 @@
# to build the index for dapps, first scrape them using the scraper
# then run: python3 -c "from index.dapps import backfill; backfill()"


from langchain.docstore.document import Document
from .weaviate import get_client
import json

INDEX_NAME = "ThirdPartyDapps"
INDEX_DESCRIPTION = "Index of Third party dapps"
DAPP_DESCRIPTION = "description"
DAPP_NAME = "name"
DAPP_URL = "url"

def delete_schema() -> None:
try:
client = get_client()
client.schema.delete_class(INDEX_NAME)
except Exception as e:
        print(f"Error deleting schema: {str(e)}")

def create_schema(delete_first: bool = False) -> None:
try:
client = get_client()
if delete_first:
delete_schema()
client.schema.get()
schema = {
"classes": [
{
"class": INDEX_NAME,
"description": INDEX_DESCRIPTION,
"vectorizer": "text2vec-openai",
"moduleConfig": {
"text2vec-openai": {
"model": "ada",
"modelVersion": "002",
"type": "text"
}
},
"properties": [
{"name": DAPP_NAME, "dataType": ["text"]},
{"name": DAPP_DESCRIPTION, "dataType": ["text"]},
{"name": DAPP_URL, "dataType": ["text"]},
{
Reviewer (Collaborator):
  • Let's just remove the social media links; they will not be used and will unnecessarily occupy space.
  • DAPP_URL does not need to be vectorized, so it can be skipped (see the sketch below).
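
A minimal sketch of how the trimmed property list could look if the social-link fields are dropped and the URL is kept as metadata but excluded from vectorization; this assumes Weaviate's per-property moduleConfig "skip" option and is not part of this PR:

properties_sketch = [
    {"name": DAPP_NAME, "dataType": ["text"]},
    {"name": DAPP_DESCRIPTION, "dataType": ["text"]},
    {
        "name": DAPP_URL,
        "dataType": ["text"],
        # assumption: keep the URL as stored metadata but skip it during vectorization
        "moduleConfig": {"text2vec-openai": {"skip": True}},
    },
]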

"name": "twitterHandle",
"dataType": ["text"],
"description": "The Twitter handle of the Dapp"
},
{
"name": "blogLinks",
"dataType": ["text[]"],
"description": "Links to the blog posts related to the Dapp"
},
{
"name": "discord",
"dataType": ["text"],
"description": "The Discord server link of the Dapp"
},
{
"name": "facebook",
"dataType": ["text"],
"description": "The Facebook page link of the Dapp"
},
{
"name": "instagram",
"dataType": ["text"],
"description": "The Instagram profile link of the Dapp"
},
{
"name": "telegram",
"dataType": ["text"],
"description": "The Telegram channel link of the Dapp"
}
]
}

]
}
client.schema.create(schema)
except Exception as e:
print(f"Error creating schema: {str(e)}")

def backfill():
try:
from langchain.vectorstores import Weaviate

with open('./knowledge_base/dapps_ranked.json') as f:
dapp_list = json.load(f)

# Extract the 'name' field from each dapp and store it in the 'documents' list
documents = [d.pop("name") for d in dapp_list]

# Use the remaining fields in each dapp to populate the 'metadatas' list
# is this the best 'metadatas' to use?
metadatas = dapp_list

create_schema(delete_first=True)

client = get_client()
w = Weaviate(client, INDEX_NAME, DAPP_NAME) # is this a proper 3rd argument?
w.add_texts(documents, metadatas)
except Exception as e:
        print(f"Error during backfill in dapps.py: {str(e)}")
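
For context, backfill expects each entry in knowledge_base/dapps_ranked.json to look roughly like the objects written by clean_payload_data in scrape/dapp_scraper.py below; the values here are illustrative only:

example_entry = {
    "name": "ExampleSwap",                        # illustrative
    "description": "A decentralized exchange.",   # illustrative
    "url": "https://example.org",
    "twitterHandle": "@exampleswap",
    "blogLinks": ["https://blog.example.org/launch"],
    "discord": "https://discord.gg/example",
    "facebook": "",
    "instagram": "",
    "telegram": "",
}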


193 changes: 193 additions & 0 deletions scrape/dapp_scraper.py
@@ -0,0 +1,193 @@
import requests
import json
import os
from typing import List

BROWSERLESS_API_KEY = os.getenv('BROWSERLESS_API_KEY', '')
SCRAPE_API_URL = f'https://chrome.browserless.io/scrape?token={BROWSERLESS_API_KEY}'

# scrape a URL for IPFS links and return them as a list
def get_ipfs_links_from_url(url: str) -> List[str]:

    # specify what elements to return - in this case IPFS links
payload = json.dumps({
"url": url,
"elements": [
{
"selector": "a[href*='ipfs.io']",
},
],
})

# make the request
r = requests.post(SCRAPE_API_URL, headers={
'Cache-Control': 'no-cache',
'Content-Type': 'application/json',
}, data=payload)

# response text
response = r.text

# Parse the JSON string into a dictionary
data = json.loads(response)

# Access the items in the 'results' key
results = data['data'][0]['results']

# instantiate array to hold cleaned URLs
cleaned_ipfs_urls = []

# loop through response data, build array of just the IPFS links
for result in results:
href_value = None
for attribute in result["attributes"]:
if attribute["name"] == "href":
href_value = attribute["value"]
break

if href_value:
cleaned_ipfs_urls.append(href_value)

# return links arr
return cleaned_ipfs_urls
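
For clarity, the parsing above assumes a /scrape response shaped roughly like the following (trimmed and illustrative; consult the browserless documentation for the exact schema):

example_scrape_response = {
    "data": [
        {
            "results": [
                {
                    "attributes": [
                        {"name": "href", "value": "https://ipfs.io/ipfs/<CID>"},
                    ],
                },
            ],
        },
    ],
}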


def scrape_ipfs_links(url: str) -> str:

payload = json.dumps({
"url": url,
"elements": [
{
"selector": "a[href*='ipfs.io']",
},
],
})

r = requests.post(SCRAPE_API_URL, headers={
'Cache-Control': 'no-cache',
'Content-Type': 'application/json',
}, data=payload)

    # raw response text
response = r.text

# Parse the JSON string into a dictionary
data = json.loads(response)

# Access the items in the 'results' key from the browserless response
results = data['data'][0]['results']
cleaned_ipfs_urls = []

for result in results:
href_value = None
for attribute in result["attributes"]:
if attribute["name"] == "href":
href_value = attribute["value"]
break

if href_value:
cleaned_ipfs_urls.append(href_value)

headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Language': 'en-US,en;q=0.9',
'Cache-Control': 'no-cache',
'Pragma': 'no-cache',
'Sec-Ch-Ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': '"macOS"',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'cross-site',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}

responses = []

# here we take the scraped CIDs and pull info for each dapp
# from cloudflare's public IPFS gateway
with requests.Session() as session:
for url in cleaned_ipfs_urls:
CID = get_url_suffix(url)
IPFS_URL = f"https://cloudflare-ipfs.com/ipfs/{CID}"
try:
response = session.get(IPFS_URL, headers=headers, timeout=30)
if response.status_code == 200:
# process the response
responses.append(response.content)
else:
print(f"Failed to retrieve {url}. Status code: {response.status_code}")
except requests.RequestException as e:
print(f"Error fetching {url}: {e}")

# convert bytes objects to strings and load them as JSON
responses_str = [json.loads(response.decode()) for response in responses]

# Save the responses array to a new json file called 'dapp-list.json'
with open('dapp-list.json', 'w') as f:
json.dump(clean_payload_data(responses_str), f, ensure_ascii=False)



# a util function that, in this case, will get us the IPFS CID
def get_url_suffix(url: str) -> str:
return url.rsplit('/', 1)[-1]

# Function to further clean the original JSON data by focusing only on the 'payload' property of 'msg'
def clean_payload_data(original_data):
# Extract and parse the 'msg' fields, then extract the 'payload' property
cleaned_payload_data = [json.loads(item.get('msg', '{}')).get('payload', {}) for item in original_data]

# reduce each obj to just a few properties that we need
reduced_data = []
for dapp in cleaned_payload_data:
cleaned_dapp = {
"name": dapp["name"],
"description": dapp["description"],
"url": dapp["url"],
"twitterHandle": dapp["twitterHandle"],
"blogLinks": dapp["blogLinks"],
"discord": dapp["socialLinks"]["discord"],
"facebook": dapp["socialLinks"]["facebook"],
"instagram": dapp["socialLinks"]["instagram"],
"telegram": dapp["socialLinks"]["telegram"]
}
reduced_data.append(cleaned_dapp)

return reduced_data


def load_data_from_json_to_db(session, json_path):
# 1. Setup
# If the table doesn't exist, create it
    # Base.metadata.create_all(session.bind)  # Don't need this - jacob b

# 2. Data Loading

# Read the JSON data
with open(json_path, "r") as file:
dapps_data = json.load(file)

# Loop through the JSON data and insert each entry into the database
for dapp in dapps_data:
        dapp_instance = Dapp(  # model class from scrape/models.py (import assumed, not shown in this diff)
description=dapp["description"],
name=dapp["name"],
url=dapp["url"],
twitter_handle=dapp["twitterHandle"],
blog_links=dapp["blogLinks"],
discord=dapp["discord"],
facebook=dapp["facebook"],
instagram=dapp["instagram"],
telegram=dapp["telegram"]
)
session.add(dapp_instance)

# 3. Finalization

# Commit the transactions
session.commit()

18 changes: 17 additions & 1 deletion scrape/models.py
@@ -12,7 +12,7 @@
from sqlalchemy.orm import ( # type: ignore
scoped_session, sessionmaker, relationship,
backref)
from sqlalchemy.dialects.postgresql import UUID, JSONB
from sqlalchemy.dialects.postgresql import UUID, JSONB, ARRAY, TEXT
from sqlalchemy.ext.declarative import declarative_base # type: ignore
from sqlalchemy_utils import ChoiceType, Timestamp # type: ignore

@@ -35,3 +35,19 @@ class ScrapedUrl(Base, Timestamp): # type: ignore
data = Column(JSONB, nullable=False)

Index('scraped_url_lookup', url, unique=True)

class Dapp(Base, Timestamp):
__tablename__ = 'dapp'

id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
description = Column(TEXT, nullable=False)
name = Column(String(255), nullable=False, unique=True)
url = Column(String(255), nullable=False)
twitter_handle = Column(String(255), nullable=True)
blog_links = Column(ARRAY(String(255)), nullable=True)
discord = Column(String(255), nullable=True)
facebook = Column(String(255), nullable=True)
instagram = Column(String(255), nullable=True)
telegram = Column(String(255), nullable=True)

    Index('dapp_by_name', name, unique=True)
2 changes: 1 addition & 1 deletion tools/index_answer.py
@@ -12,7 +12,7 @@

TEMPLATE = '''You are a web3 assistant. You help users with answering web3-related questions. Your responses should sound natural, helpful, cheerful, and engaging, and you should use easy to understand language with explanations for jargon.

Information to help complete your task is below. Only use information below to answer the question, and create a final answer with inline citations linked to the provided source URLs. If you don't know the answer, just say that you don't know. Don't try to make up an answer. ALWAYS return a "SOURCES" part in your answer corresponding to the numbered inline citations.
Information to help complete your task is below. Only use information below to answer the question, and create a final answer with inline citations linked to the provided source URLs. If you don't know the answer, just say that you don't know. Don't try to make up an answer. ALWAYS return a "SOURCES" part in your answer corresponding to the numbered inline citations. ALWAYS provide a link to each citation in SOURCES.
Reviewer (Collaborator):
  • Similar to having a separate function handler for link suggestion distinct from scraped sites, it may be better to also have a separate tools module, e.g. index_dapp_suggestion or similar.

  • Trying to figure out how consistently the LLM is able to cite SOURCES in a reliable format; any insights on that?
    We could also explore a more structured format for the results, such as a list of JSON objects with fields like dapp_name, dapp_desc, dapp_url, and then post-process it or send it to the FE as JSON to render consistently (a rough sketch follows).
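
A rough sketch of the structured result format suggested above; field names follow the comment and the values are illustrative, not part of this PR:

structured_results = [
    {
        "dapp_name": "ExampleSwap",               # illustrative
        "dapp_desc": "A decentralized exchange.",
        "dapp_url": "https://example.org",
        "source": "https://example.org/docs",     # citation link for the SOURCES section
    },
]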

---
{task_info}
---
7 changes: 5 additions & 2 deletions tools/index_widget.py
@@ -355,13 +355,16 @@ def fn(token_handler):
@error_wrap
def fetch_scraped_sites(query: str) -> Callable:
def fn(token_handler):
scraped_sites_index = config.initialize(config.scraped_sites_index)
# below is the old index used previously for scraped sites - do we still use this?
Reviewer (Collaborator):
I think there is some overlap between the link suggestion and scraped sites modules, as both seek to answer generic questions about the ecosystem.
If we use both of them, I suspect the model may not be able to reliably discern which one to use.

Can you test this by having both enabled and seeing how the system performs? To test both, you will need to have a separate widget command for each in the yaml.

You can also remove your comments.

# if so, should we make a different function for the new dapps_index?
# scraped_sites_index = config.initialize(config.scraped_sites_index)
dapps_index = config.initialize(config.dapps_index)
tool = dict(
type="tools.index_answer.IndexAnswerTool",
_streaming=True,
name="ScrapedSitesIndexAnswer",
content_description="", # not used
index=scraped_sites_index,
index=dapps_index,
top_k=3,
source_key="url",
)
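
For reference, a rough sketch of what a separate handler dedicated to the dapps index could look like, mirroring fetch_scraped_sites; the function and tool names are hypothetical and not part of this PR:

@error_wrap
def fetch_dapp_suggestions(query: str) -> Callable:  # hypothetical name
    def fn(token_handler):
        dapps_index = config.initialize(config.dapps_index)
        tool = dict(
            type="tools.index_answer.IndexAnswerTool",
            _streaming=True,
            name="DappsIndexAnswer",   # hypothetical tool name
            content_description="",    # not used
            index=dapps_index,
            top_k=3,
            source_key="url",
        )
        # ... the remainder would mirror fetch_scraped_sites (not shown in this diff)
    return fn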