diff --git a/.metals/metals.lock.db b/.metals/metals.lock.db new file mode 100644 index 0000000..98e8dfd --- /dev/null +++ b/.metals/metals.lock.db @@ -0,0 +1,6 @@ +#FileLock +#Mon Oct 09 13:58:34 EDT 2023 +hostName=localhost +id=18b159714afc7992c11ec7d9147ac2956796e2e5dc2 +method=file +server=localhost\:55612 diff --git a/.scala-build/.bloop/project_bd2c96d2de-test.json b/.scala-build/.bloop/project_bd2c96d2de-test.json new file mode 100644 index 0000000..954293a --- /dev/null +++ b/.scala-build/.bloop/project_bd2c96d2de-test.json @@ -0,0 +1 @@ +{"version":"1.4.0","project":{"name":"project_bd2c96d2de-test","directory":"/Users/brianreicher/Documents/GitHub/turbine/.scala-build","workspaceDir":"/Users/brianreicher/Documents/GitHub/turbine","sources":[],"dependencies":["project_bd2c96d2de"],"classpath":["/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/scala-lang/scala3-library_3/3.2.2/scala3-library_3-3.2.2.jar","/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/scala-lang/scala-library/2.13.10/scala-library-2.13.10.jar","/Users/brianreicher/Documents/GitHub/turbine/.scala-build/project_bd2c96d2de/classes/main","/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/com/sourcegraph/semanticdb-javac/0.7.4/semanticdb-javac-0.7.4.jar"],"out":"/Users/brianreicher/Documents/GitHub/turbine/.scala-build/.bloop/project_bd2c96d2de-test","classesDir":"/Users/brianreicher/Documents/GitHub/turbine/.scala-build/project_bd2c96d2de/classes/test","scala":{"organization":"org.scala-lang","name":"scala-compiler","version":"3.2.2","options":["-Xsemanticdb","-sourceroot","/Users/brianreicher/Documents/GitHub/turbine","-release","8"],"jars":["/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/scala-lang/scala3-compiler_3/3.2.2/scala3-compiler_3-3.2.2.jar","/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/scala-lang/scala3-interfaces/3.2.2/scala3-interfaces-3.2.2.jar","/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/scala-lang/scala3-library_3/3.2.2/scala3-library_3-3.2.2.jar","/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/scala-lang/tasty-core_3/3.2.2/tasty-core_3-3.2.2.jar","/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/scala-lang/modules/scala-asm/9.3.0-scala-1/scala-asm-9.3.0-scala-1.jar","/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/scala-sbt/compiler-interface/1.3.5/compiler-interface-1.3.5.jar","/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/jline/jline-reader/3.19.0/jline-reader-3.19.0.jar","/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/jline/jline-terminal/3.19.0/jline-terminal-3.19.0.jar","/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/jline/jline-terminal-jna/3.19.0/jline-terminal-jna-3.19.0.jar","/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/scala-lang/scala-library/2.13.10/scala-library-2.13.10.jar","/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/com/google/protobuf/protobuf-java/3.7.0/protobuf-java-3.7.0.jar","/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/scala-sbt/util-interface/1.3.0/util-interface-1.3.0.jar","/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/net/java/dev/jna/jna/5.3.1/jna-5.3.1.jar"]},"java":{"options":["--release","8","-Xplugin:semanticdb -sourceroot:/Users/brianreicher/Documents/GitHub/turbine -targetroot:javac-classes-directory","-J--add-exports","-Jjdk.compiler/com.sun.tools.javac.api=ALL-UNNAMED","-J--add-exports","-Jjdk.compiler/com.sun.tools.javac.code=ALL-UNNAMED","-J--add-exports","-Jjdk.compiler/com.sun.tools.javac.model=ALL-UNNAMED","-J--add-exports","-Jjdk.compiler/com.sun.tools.javac.tree=ALL-UNNAMED","-J--add-exports","-Jjdk.compiler/com.sun.tools.javac.util=ALL-UNNAMED"]},"test":{"frameworks":[{"names":["com.novocode.junit.JUnitFramework"]},{"names":["org.scalatest.tools.Framework","org.scalatest.tools.ScalaTestFramework"]},{"names":["org.scalacheck.ScalaCheckFramework"]},{"names":["org.specs.runner.SpecsFramework","org.specs2.runner.Specs2Framework","org.specs2.runner.SpecsFramework"]},{"names":["utest.runner.Framework"]},{"names":["munit.Framework"]}],"options":{"excludes":[],"arguments":[]}},"platform":{"name":"jvm","config":{"options":[]},"mainClass":[]},"resolution":{"modules":[{"organization":"org.scala-lang","name":"scala3-library_3","version":"3.2.2","artifacts":[{"name":"scala3-library_3","path":"/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/scala-lang/scala3-library_3/3.2.2/scala3-library_3-3.2.2.jar"},{"name":"scala3-library_3","classifier":"sources","path":"/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/scala-lang/scala3-library_3/3.2.2/scala3-library_3-3.2.2-sources.jar"}]},{"organization":"org.scala-lang","name":"scala-library","version":"2.13.10","artifacts":[{"name":"scala-library","path":"/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/scala-lang/scala-library/2.13.10/scala-library-2.13.10.jar"},{"name":"scala-library","classifier":"sources","path":"/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/scala-lang/scala-library/2.13.10/scala-library-2.13.10-sources.jar"}]}]},"tags":["test"]}} \ No newline at end of file diff --git a/.scala-build/.bloop/project_bd2c96d2de.json b/.scala-build/.bloop/project_bd2c96d2de.json new file mode 100644 index 0000000..ecf1ec5 --- /dev/null +++ b/.scala-build/.bloop/project_bd2c96d2de.json @@ -0,0 +1 @@ +{"version":"1.4.0","project":{"name":"project_bd2c96d2de","directory":"/Users/brianreicher/Documents/GitHub/turbine/.scala-build","workspaceDir":"/Users/brianreicher/Documents/GitHub/turbine","sources":[],"dependencies":[],"classpath":["/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/scala-lang/scala3-library_3/3.2.2/scala3-library_3-3.2.2.jar","/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/scala-lang/scala-library/2.13.10/scala-library-2.13.10.jar","/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/com/sourcegraph/semanticdb-javac/0.7.4/semanticdb-javac-0.7.4.jar"],"out":"/Users/brianreicher/Documents/GitHub/turbine/.scala-build/.bloop/project_bd2c96d2de","classesDir":"/Users/brianreicher/Documents/GitHub/turbine/.scala-build/project_bd2c96d2de/classes/main","scala":{"organization":"org.scala-lang","name":"scala-compiler","version":"3.2.2","options":["-Xsemanticdb","-sourceroot","/Users/brianreicher/Documents/GitHub/turbine","-release","8"],"jars":["/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/scala-lang/scala3-compiler_3/3.2.2/scala3-compiler_3-3.2.2.jar","/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/scala-lang/scala3-interfaces/3.2.2/scala3-interfaces-3.2.2.jar","/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/scala-lang/scala3-library_3/3.2.2/scala3-library_3-3.2.2.jar","/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/scala-lang/tasty-core_3/3.2.2/tasty-core_3-3.2.2.jar","/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/scala-lang/modules/scala-asm/9.3.0-scala-1/scala-asm-9.3.0-scala-1.jar","/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/scala-sbt/compiler-interface/1.3.5/compiler-interface-1.3.5.jar","/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/jline/jline-reader/3.19.0/jline-reader-3.19.0.jar","/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/jline/jline-terminal/3.19.0/jline-terminal-3.19.0.jar","/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/jline/jline-terminal-jna/3.19.0/jline-terminal-jna-3.19.0.jar","/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/scala-lang/scala-library/2.13.10/scala-library-2.13.10.jar","/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/com/google/protobuf/protobuf-java/3.7.0/protobuf-java-3.7.0.jar","/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/scala-sbt/util-interface/1.3.0/util-interface-1.3.0.jar","/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/net/java/dev/jna/jna/5.3.1/jna-5.3.1.jar"]},"java":{"options":["--release","8","-Xplugin:semanticdb -sourceroot:/Users/brianreicher/Documents/GitHub/turbine -targetroot:javac-classes-directory","-J--add-exports","-Jjdk.compiler/com.sun.tools.javac.api=ALL-UNNAMED","-J--add-exports","-Jjdk.compiler/com.sun.tools.javac.code=ALL-UNNAMED","-J--add-exports","-Jjdk.compiler/com.sun.tools.javac.model=ALL-UNNAMED","-J--add-exports","-Jjdk.compiler/com.sun.tools.javac.tree=ALL-UNNAMED","-J--add-exports","-Jjdk.compiler/com.sun.tools.javac.util=ALL-UNNAMED"]},"platform":{"name":"jvm","config":{"options":[]},"mainClass":[]},"resolution":{"modules":[{"organization":"org.scala-lang","name":"scala3-library_3","version":"3.2.2","artifacts":[{"name":"scala3-library_3","path":"/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/scala-lang/scala3-library_3/3.2.2/scala3-library_3-3.2.2.jar"},{"name":"scala3-library_3","classifier":"sources","path":"/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/scala-lang/scala3-library_3/3.2.2/scala3-library_3-3.2.2-sources.jar"}]},{"organization":"org.scala-lang","name":"scala-library","version":"2.13.10","artifacts":[{"name":"scala-library","path":"/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/scala-lang/scala-library/2.13.10/scala-library-2.13.10.jar"},{"name":"scala-library","classifier":"sources","path":"/Users/brianreicher/Library/Caches/Coursier/v1/https/repo1.maven.org/maven2/org/scala-lang/scala-library/2.13.10/scala-library-2.13.10-sources.jar"}]}]},"tags":["library"]}} \ No newline at end of file diff --git a/ingestion/.gitignore b/ingestion/.gitignore deleted file mode 100644 index bddd188..0000000 --- a/ingestion/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -/.bsp/ -target/ diff --git a/ingestion/build.sbt b/ingestion/build.sbt deleted file mode 100644 index d120c38..0000000 --- a/ingestion/build.sbt +++ /dev/null @@ -1,25 +0,0 @@ -import Dependencies._ - -ThisBuild / scalaVersion := "2.13.11" -ThisBuild / version := "0.1.0-SNAPSHOT" -ThisBuild / organization := "com.example" -ThisBuild / organizationName := "example" - -lazy val root = (project in file(".")) - .settings( - name := "turbine", - libraryDependencies += munit % Test - ) - -libraryDependencies += "org.apache.httpcomponents" % "httpclient" % "4.5.13" -libraryDependencies ++= Seq( - "org.apache.spark" %% "spark-core" % "3.0.3", - "org.apache.spark" %% "spark-sql" % "3.0.3" -) - -libraryDependencies ++= Seq( - "io.circe" %% "circe-core" % "0.14.1", // Core Circe functionality - "io.circe" %% "circe-generic" % "0.14.1", // For automatic derivation of codecs - "io.circe" %% "circe-parser" % "0.14.1" // For parsing JSON -) -// See https://www.scala-sbt.org/1.x/docs/Using-Sonatype.html for instructions on how to publish to Sonatype. diff --git a/ingestion/project/Dependencies.scala b/ingestion/project/Dependencies.scala deleted file mode 100644 index 1edb07a..0000000 --- a/ingestion/project/Dependencies.scala +++ /dev/null @@ -1,5 +0,0 @@ -import sbt._ - -object Dependencies { - lazy val munit = "org.scalameta" %% "munit" % "0.7.29" -} diff --git a/ingestion/project/build.properties b/ingestion/project/build.properties deleted file mode 100644 index 3040987..0000000 --- a/ingestion/project/build.properties +++ /dev/null @@ -1 +0,0 @@ -sbt.version=1.9.4 diff --git a/ingestion/src/main/scala/driver/Driver.scala b/ingestion/src/main/scala/driver/Driver.scala deleted file mode 100644 index 23d4c81..0000000 --- a/ingestion/src/main/scala/driver/Driver.scala +++ /dev/null @@ -1,38 +0,0 @@ -package driver - -import sys.process._ -import io.circe.parser - -object GitHubRepoDownloader { - def main(args: Array[String]): Unit = { - // error handling with arg parsing - if (args.length != 1) { - println("Usage: GitHubRepoDownloader ") - System.exit(1) - } - - val apiUrl = args(0) - val githubAccessToken = "PUT AT" - - // fetch user's repos using a get request - val repoInfo = scala.io.Source.fromURL(apiUrl, headers = Seq("Authorization" -> s"token $githubAccessToken")).mkString - - // parse JSON response - // get name and url - val repositories = io.circe.parser.parse(repoInfo) match { - case Right(json) => - json.asArray.getOrElse(Vector.empty).map { repo => - (repo.hcursor.downField("name").as[String].getOrElse(""), repo.hcursor.downField("clone_url").as[String].getOrElse("")) - } - case Left(error) => - println(s"Failed to parse JSON: $error") - Vector.empty - } - - // clone repos - repositories.foreach { case (repoName, cloneUrl) => - println(s"Cloning $repoName...") - s"git clone $cloneUrl".! - } - } -} diff --git a/ingestion/src/main/scala/driver/FileFilter.scala b/ingestion/src/main/scala/driver/FileFilter.scala deleted file mode 100644 index 9dcd237..0000000 --- a/ingestion/src/main/scala/driver/FileFilter.scala +++ /dev/null @@ -1,10 +0,0 @@ -package driver - -import org.apache.spark.sql.{Dataset, DataFrame} -import org.apache.spark.sql.functions.col - -object FileFilter { - def filterFilesByExtensions(repoFiles: Dataset[PreprocessGitHubRepos.RepoFile], extensions: Seq[String]): Dataset[PreprocessGitHubRepos.RepoFile] = { - repoFiles.filter(file => extensions.exists(file.fileName.endsWith)) - } -} diff --git a/ingestion/src/main/scala/driver/ProcessRepos.scala b/ingestion/src/main/scala/driver/ProcessRepos.scala deleted file mode 100644 index 344007c..0000000 --- a/ingestion/src/main/scala/driver/ProcessRepos.scala +++ /dev/null @@ -1,51 +0,0 @@ -package driver - - -import org.apache.spark.sql.{SparkSession, Dataset} -import org.apache.spark.sql.functions._ - -object ProcessRepos { - case class RepoFile(repoName: String, fileName: String, fileContent: String) - - def main(args: Array[String]): Unit = { - if (args.length != 1) { - println("Usage: PreprocessGitHubRepos ") - System.exit(1) - } - - val clonedReposPath = args(0) - - val spark = SparkSession.builder() - .appName("PreprocessGitHubRepos") - .getOrCreate() - - import spark.implicits._ - - // Read all files from cloned repositories - val repoFiles: Dataset[RepoFile] = spark.read - .textFile(s"$clonedReposPath/*/*") - .flatMap { line => - val lines = line.split("\n") - val repoName = lines.headOption.getOrElse("") - val fileName = lines.lift(1).getOrElse("") - val fileContent = lines.drop(2).mkString("\n") - if (repoName.nonEmpty && fileName.nonEmpty && fileContent.nonEmpty) - Some(RepoFile(repoName, fileName, fileContent)) - else - None - } - - // Define the list of file extensions to filter by - val allowedExtensions = Seq(".go", ".tsx", ".py", ".js", ".css", ".yaml", ".yml") - - // Use the FileFilter object to filter files by extensions - val filteredFiles = FileFilter.filterFilesByExtensions(repoFiles, allowedExtensions) - - - // Show the first few records as a demonstration - filteredFiles.show() - - // Stop the Spark session when done - spark.stop() - } -} diff --git a/ingestion/src/test/scala/example/HelloSpec.scala b/ingestion/src/test/scala/example/HelloSpec.scala deleted file mode 100644 index d57b5ed..0000000 --- a/ingestion/src/test/scala/example/HelloSpec.scala +++ /dev/null @@ -1,7 +0,0 @@ -package example - -class HelloSpec extends munit.FunSuite { - test("say hello") { - assertEquals(Hello.greeting, "hello") - } -} diff --git a/tokenizer/Cargo.toml b/tokenizer/Cargo.toml new file mode 100644 index 0000000..eb1d7fa --- /dev/null +++ b/tokenizer/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "tokenizer" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +pinenut = "0.1.3" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" \ No newline at end of file diff --git a/tokenizer/src/main.rs b/tokenizer/src/main.rs new file mode 100644 index 0000000..6f818c8 --- /dev/null +++ b/tokenizer/src/main.rs @@ -0,0 +1,102 @@ +use pinenut::{Client, Index, models::Vector}; +use serde_json::{Value, from_str}; + + +pub struct PinceoneDriver { + client: Client, + index: Index, +} + +impl PinceoneDriver { + pub fn new(host: &str, port: u16, db_name: &str) -> Self { + let client = Client::with_uri_str(&format!("mongodb://{}:{}", host, port)) + .expect("Failed to create PineconeDB client"); + let client = Client::new(env!("PINECONE_API_KEY"), env!("PINECONE_ENV")).await.unwrap(); + let mut index = client.index(env!("PINECONE_INDEX_NAME")); + let _ = index.describe().await.unwrap(); + + PinceoneDriver { client, index } + } + + pub async fn flush_collection(&self, collection_name: &str) -> Result<(), mongodb::error::Error> { + let collection = self.client.database(&self.db_name).collection(collection_name); + let result = collection.delete_many(Document::new()).await?; + println!("Deleted {} documents from {}", result.deleted_count, collection_name); + Ok(()) + } + + pub async fn remove_collection(&self, collection_name: &str) -> Result<(), mongodb::error::Error> { + let collection = self.client.database(&self.db_name).collection(collection_name); + collection.drop(None).await?; + println!("Dropped {} from {}", collection_name, self.db_name); + Ok(()) + } + + pub async fn create_collection(&self, collection_name: &str) -> Result<(), mongodb::error::Error> { + self.client.database(&self.db_name).create_collection(collection_name, None).await?; + println!("Created collection {} in {}", collection_name, self.db_name); + Ok(()) + } + + pub async fn collection_size(&self, collection_name: &str) -> Result { + let collection = self.client.database(&self.db_name).collection(collection_name); + let count = collection.count_documents(Document::new(), None).await?; + Ok(count) + } + + pub async fn insert_data(&self, collection_name: &str, json_file: &str, clear: bool) -> Result<(), mongodb::error::Error> { + let collection = self.client.database(&self.db_name).collection(collection_name); + + if clear { + self.flush_collection(collection_name).await?; + } + + let file_contents = std::fs::read_to_string(json_file)?; + let data: Vec = from_str(&file_contents)?; + + let mut documents = Vec::new(); + for document in data { + if let Value::Object(map) = document { + let bson_doc = Document::try_from(map)?; + documents.push(bson_doc); + } + } + + collection.insert_many(documents, None).await?; + println!("Inserted {} documents into {} in the {}", documents.len(), collection_name, self.db_name); + Ok(()) + } + + pub async fn search_query(&self, collection_name: &str, qu: Document, proj: Document, lim: i64, show: bool) -> Result, mongodb::error::Error> { + let collection = self.client.database(&self.db_name).collection(collection_name); + let cursor = collection.find(qu, None).projection(proj).limit(lim); + let mut result = Vec::new(); + + if show { + while let Some(doc) = cursor.next().await { + match doc { + Ok(document) => { + println!("{:?}", document); + result.push(document); + } + Err(e) => { + eprintln!("Error: {:?}", e); + } + } + } + } else { + while let Some(doc) = cursor.next().await { + match doc { + Ok(document) => { + result.push(document); + } + Err(e) => { + eprintln!("Error: {:?}", e); + } + } + } + } + + Ok(result) + } +}