Moved pipeline code into its own folder

basedrhys · Dec 14, 2019 · e643e5f · e643e5f
1 parent 85c9773
commit e643e5f
Show file tree

Hide file tree

Showing 61 changed files with 1,285 additions and 1,261 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,12 +1,12 @@
 # Models
-models/*
+**/models/*
 
 # Datasets
-java_files/*
-text_arff/*
+**/java_files/*
+**/text_arff/*
 
 # Weka files
-weka_files/*
+**/weka_files/*
 
 # Keep readme files
 !**/README.md

diff --git a/JavaExtractor/JPredict/target/JavaExtractor-0.0.1-SNAPSHOT.jar b/JavaExtractor/JPredict/target/JavaExtractor-0.0.1-SNAPSHOT.jar
diff --git a/JavaExtractor/JPredict/target/classes/JavaExtractor/App.class b/JavaExtractor/JPredict/target/classes/JavaExtractor/App.class
diff --git a/JavaExtractor/JPredict/target/classes/JavaExtractor/Common/CommandLineValues.class b/JavaExtractor/JPredict/target/classes/JavaExtractor/Common/CommandLineValues.class
diff --git a/JavaExtractor/JPredict/target/classes/JavaExtractor/Common/Common$1.class b/JavaExtractor/JPredict/target/classes/JavaExtractor/Common/Common$1.class
diff --git a/JavaExtractor/JPredict/target/classes/JavaExtractor/Common/Common$2.class b/JavaExtractor/JPredict/target/classes/JavaExtractor/Common/Common$2.class
diff --git a/JavaExtractor/JPredict/target/classes/JavaExtractor/Common/Common$3.class b/JavaExtractor/JPredict/target/classes/JavaExtractor/Common/Common$3.class
diff --git a/JavaExtractor/JPredict/target/classes/JavaExtractor/Common/Common.class b/JavaExtractor/JPredict/target/classes/JavaExtractor/Common/Common.class
diff --git a/JavaExtractor/JPredict/target/classes/JavaExtractor/Common/MethodContent.class b/JavaExtractor/JPredict/target/classes/JavaExtractor/Common/MethodContent.class
diff --git a/JavaExtractor/JPredict/target/classes/JavaExtractor/ExtractFeaturesTask.class b/JavaExtractor/JPredict/target/classes/JavaExtractor/ExtractFeaturesTask.class
diff --git a/JavaExtractor/JPredict/target/classes/JavaExtractor/FeatureExtractor.class b/JavaExtractor/JPredict/target/classes/JavaExtractor/FeatureExtractor.class
diff --git a/JavaExtractor/JPredict/target/classes/JavaExtractor/FeaturesEntities/ProgramFeatures.class b/JavaExtractor/JPredict/target/classes/JavaExtractor/FeaturesEntities/ProgramFeatures.class
diff --git a/JavaExtractor/JPredict/target/classes/JavaExtractor/FeaturesEntities/ProgramNode.class b/JavaExtractor/JPredict/target/classes/JavaExtractor/FeaturesEntities/ProgramNode.class
diff --git a/JavaExtractor/JPredict/target/classes/JavaExtractor/FeaturesEntities/ProgramRelation.class b/JavaExtractor/JPredict/target/classes/JavaExtractor/FeaturesEntities/ProgramRelation.class
diff --git a/JavaExtractor/JPredict/target/classes/JavaExtractor/FeaturesEntities/Property.class b/JavaExtractor/JPredict/target/classes/JavaExtractor/FeaturesEntities/Property.class
diff --git a/JavaExtractor/JPredict/target/classes/JavaExtractor/Visitors/FunctionVisitor.class b/JavaExtractor/JPredict/target/classes/JavaExtractor/Visitors/FunctionVisitor.class
diff --git a/JavaExtractor/JPredict/target/classes/JavaExtractor/Visitors/LeavesCollectorVisitor.class b/JavaExtractor/JPredict/target/classes/JavaExtractor/Visitors/LeavesCollectorVisitor.class
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-# Obsucated code2vec: Improving Generalisation by Hiding Information
+# Obfuscated code2vec: Reducing Model Bias by Hiding Information
 
 ![Overall project view](img/overall.png)
 
@@ -11,18 +11,37 @@ All of the model-related code (`common.py`, `model.py`, `PathContextReader.py`)
 All models/datasets are on the paper google drive folder
 https://drive.google.com/drive/u/1/folders/1CXgSXKf292BTlryASui2kBvYvJSvFnWN
 
+## Requirements
+- Java 8+
+- Python 3
+
+## Usage - Obfuscator
+These steps should all be run from within the `java-obfuscator/` directory.
+1. Locate a folder of `.java` files (e.g., from the [code2seq](https://github.com/tech-srl/code2seq) repository)
+2. Alter the input and output directories in `obfs-script.sh`, as well as the number of threads of your machine. If you're running this on a particularly large folder (e.g., millions of files) then you may need to increase the `NUM_PARTITIONS` to 3 or 4, otherwise memory issues can occur, grinding the obfuscator to a near halt.
+3. Run `obfs-script.sh`, i.e. `$ source obfs-script.sh`
+
+This will result in a new folder of obfuscated `.java` files, which can be used to train a new obfuscated code2vec model (or any model that performs learning from source code, for that matter).
+
 ## Usage - Dataset Pipeline
 
 ![Dataset Pipeline View](img/pipeline.png)
 
+These steps will convert a dataset of `.java` files into a numerical form (`.arff` by default), which can then be used with any standard WEKA classifier.
+
+These steps should all be run from within the `pipeline/` directory of this repository.
 To run the dataset pipeline and create class-level embeddings for a dataset of Java files:
+1. `cd pipeline`
+2. `pip install -r requirements.txt`
 1. Download a `.java` dataset (from the datasets supplied or your own) and put it in the `java_files/` directory
 2. Download a code2vec model checkpoint and put the checkpoint folder in the `models/` directory
-3. Change the paths and definitions in `model_defs.py` and number of models in `create_datasets.sh` to match your setup
-4. Run `create_datasets.sh`. This will loop through each model and create class-level embeddings for the supplied datasets. The resulting datasets will be in `.arff` format in the `weka_files/` folder
+3. Change the paths and definitions in `model_defs.py` and number of models in `scripts/create_datasets.sh` to match your setup
+4. Run `create_datasets.sh` (`source scripts/create_datasets.sh`). This will loop through each model and create class-level embeddings for the supplied datasets. The resulting datasets will be in `.arff` format in the `weka_files/` folder. 
+
+You can now perform class-level classification on the dataset using any off-the-shelf classifier.
 
 ### Config
-By default the pipeline will use the full range of values for each parameter, which creates a huge number of resulting `.arff` datasets (>1000). To reduce the number of these, remove (or comment out) some of the items in the arrays in `reduction_methods.py` and `selection_methods.py` (at the end of the file). Our experiments showed that the `SelectAll` selection method and `NoReduction` reduction method performed best in most cases so you may want to keep only these.
+By default the pipeline will use the full range of values for each parameter, which creates a huge number of resulting `.arff` datasets (>1000). To reduce the number of these, remove (or comment out) some of the items in the arrays in `reduction_methods.py` and `selection_methods.py` (at the end of the file). Our experiments showed that the `SelectAll` selection method and `NoReduction` reduction method performed best in most cases so you may want to just keep these.
 
 ## Datasets
 
@@ -52,6 +71,8 @@ The `.java` files are all [available for download](https://drive.google.com/driv
 
 13 categories, 1062 instances
 
+This dataset was collected using the [github-scraper](https://github.com/basedrhys/github-scraper) Python tool, which makes it easy to download specific types of files from GitHub repos (`.java` files in this case).
+
 [Google Drive Link](https://drive.google.com/open?id=1IC0Nxeew73p9yvfhKcKH-6mxW8nHGyfn)
 
 [Embedding Visualisation](http://projector.tensorflow.org/?config=https://gist.githubusercontent.com/basedrhys/36fcd8653f2d759a8f1b03e56502a58e/raw/7d2ddef1c219d4fad7a49cc2c978d1ff4e25e5f1/author_config.json)

diff --git a/java-tool.jar b/java-tool.jar
diff --git a/ClassPreprocessor.py → pipeline/ClassPreprocessor.py b/ClassPreprocessor.py → pipeline/ClassPreprocessor.py
diff --git a/JavaExtractor/JPredict/.classpath → pipeline/JavaExtractor/JPredict/.classpath b/JavaExtractor/JPredict/.classpath → pipeline/JavaExtractor/JPredict/.classpath
diff --git a/JavaExtractor/JPredict/.gitignore → pipeline/JavaExtractor/JPredict/.gitignore b/JavaExtractor/JPredict/.gitignore → pipeline/JavaExtractor/JPredict/.gitignore
diff --git a/JavaExtractor/JPredict/.project → pipeline/JavaExtractor/JPredict/.project b/JavaExtractor/JPredict/.project → pipeline/JavaExtractor/JPredict/.project
@@ -1,23 +1,23 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<projectDescription>
-	<name>JavaExtractor</name>
-	<comment></comment>
-	<projects>
-	</projects>
-	<buildSpec>
-		<buildCommand>
-			<name>org.eclipse.jdt.core.javabuilder</name>
-			<arguments>
-			</arguments>
-		</buildCommand>
-		<buildCommand>
-			<name>org.eclipse.m2e.core.maven2Builder</name>
-			<arguments>
-			</arguments>
-		</buildCommand>
-	</buildSpec>
-	<natures>
-		<nature>org.eclipse.jdt.core.javanature</nature>
-		<nature>org.eclipse.m2e.core.maven2Nature</nature>
-	</natures>
-</projectDescription>
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>JavaExtractor</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.jdt.core.javabuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>org.eclipse.m2e.core.maven2Builder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.jdt.core.javanature</nature>
+		<nature>org.eclipse.m2e.core.maven2Nature</nature>
+	</natures>
+</projectDescription>
diff --git a/...settings/org.eclipse.core.resources.prefs → ...settings/org.eclipse.core.resources.prefs b/...settings/org.eclipse.core.resources.prefs → ...settings/org.eclipse.core.resources.prefs
@@ -1,3 +1,3 @@
-eclipse.preferences.version=1
-encoding//src/main/java=UTF-8
-encoding/<project>=UTF-8
+eclipse.preferences.version=1
+encoding//src/main/java=UTF-8
+encoding/<project>=UTF-8
diff --git a/.../.settings/org.eclipse.jdt.apt.core.prefs → .../.settings/org.eclipse.jdt.apt.core.prefs b/.../.settings/org.eclipse.jdt.apt.core.prefs → .../.settings/org.eclipse.jdt.apt.core.prefs
diff --git a/...dict/.settings/org.eclipse.jdt.core.prefs → ...dict/.settings/org.eclipse.jdt.core.prefs b/...dict/.settings/org.eclipse.jdt.core.prefs → ...dict/.settings/org.eclipse.jdt.core.prefs
@@ -1,16 +1,16 @@
-eclipse.preferences.version=1
-org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
-org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
-org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
-org.eclipse.jdt.core.compiler.compliance=1.8
-org.eclipse.jdt.core.compiler.debug.lineNumber=generate
-org.eclipse.jdt.core.compiler.debug.localVariable=generate
-org.eclipse.jdt.core.compiler.debug.sourceFile=generate
-org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
-org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled
-org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
-org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
-org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore
-org.eclipse.jdt.core.compiler.processAnnotations=disabled
-org.eclipse.jdt.core.compiler.release=disabled
-org.eclipse.jdt.core.compiler.source=1.8
+eclipse.preferences.version=1
+org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
+org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
+org.eclipse.jdt.core.compiler.compliance=1.8
+org.eclipse.jdt.core.compiler.debug.lineNumber=generate
+org.eclipse.jdt.core.compiler.debug.localVariable=generate
+org.eclipse.jdt.core.compiler.debug.sourceFile=generate
+org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
+org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled
+org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
+org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
+org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore
+org.eclipse.jdt.core.compiler.processAnnotations=disabled
+org.eclipse.jdt.core.compiler.release=disabled
+org.eclipse.jdt.core.compiler.source=1.8
diff --git a/...dict/.settings/org.eclipse.m2e.core.prefs → ...dict/.settings/org.eclipse.m2e.core.prefs b/...dict/.settings/org.eclipse.m2e.core.prefs → ...dict/.settings/org.eclipse.m2e.core.prefs
@@ -1,4 +1,4 @@
-activeProfiles=
-eclipse.preferences.version=1
-resolveWorkspaceProjects=true
-version=1
+activeProfiles=
+eclipse.preferences.version=1
+resolveWorkspaceProjects=true
+version=1
diff --git a/JavaExtractor/JPredict/pom.xml → pipeline/JavaExtractor/JPredict/pom.xml b/JavaExtractor/JPredict/pom.xml → pipeline/JavaExtractor/JPredict/pom.xml
@@ -1,75 +1,75 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-  <groupId>JavaExtractor</groupId>
-  <artifactId>JavaExtractor</artifactId>
-  <name>JPredict</name>
-  <version>0.0.1-SNAPSHOT</version>
-  <url>http://maven.apache.org</url>
-  <build>
-    <plugins>
-      <plugin>
-        <artifactId>maven-compiler-plugin</artifactId>
-        <version>3.2</version>
-        <configuration>
-          <source>1.8</source>
-          <target>1.8</target>
-          <excludes>
-            <exclude>Test.java</exclude>
-          </excludes>
-        </configuration>
-      </plugin>
-      <plugin>
-        <artifactId>maven-shade-plugin</artifactId>
-        <version>2.1</version>
-        <executions>
-          <execution>
-            <phase>package</phase>
-            <goals>
-              <goal>shade</goal>
-            </goals>
-            <configuration>
-              <transformers>
-                <transformer
-                  implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
-                </transformer>
-              </transformers>
-            </configuration>
-          </execution>
-        </executions>
-      </plugin>
-    </plugins>
-  </build>
-  <dependencies>
-    <dependency>
-      <groupId>com.github.javaparser</groupId>
-      <artifactId>javaparser-core</artifactId>
-      <version>3.0.0-alpha.4</version>
-    </dependency>
-    <dependency>
-      <groupId>commons-io</groupId>
-      <artifactId>commons-io</artifactId>
-      <version>1.3.2</version>
-      <scope>compile</scope>
-    </dependency>
-    <dependency>
-      <groupId>com.fasterxml.jackson.core</groupId>
-      <artifactId>jackson-databind</artifactId>
-      <version>2.9.10.1</version>
-    </dependency>
-    <dependency>
-      <groupId>args4j</groupId>
-      <artifactId>args4j</artifactId>
-      <version>2.33</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.commons</groupId>
-      <artifactId>commons-lang3</artifactId>
-      <version>3.5</version>
-    </dependency>
-  </dependencies>
-  <properties>
-    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-  </properties>
-</project>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <groupId>JavaExtractor</groupId>
+  <artifactId>JavaExtractor</artifactId>
+  <name>JPredict</name>
+  <version>0.0.1-SNAPSHOT</version>
+  <url>http://maven.apache.org</url>
+  <build>
+    <plugins>
+      <plugin>
+        <artifactId>maven-compiler-plugin</artifactId>
+        <version>3.2</version>
+        <configuration>
+          <source>1.8</source>
+          <target>1.8</target>
+          <excludes>
+            <exclude>Test.java</exclude>
+          </excludes>
+        </configuration>
+      </plugin>
+      <plugin>
+        <artifactId>maven-shade-plugin</artifactId>
+        <version>2.1</version>
+        <executions>
+          <execution>
+            <phase>package</phase>
+            <goals>
+              <goal>shade</goal>
+            </goals>
+            <configuration>
+              <transformers>
+                <transformer
+                  implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+                </transformer>
+              </transformers>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+  <dependencies>
+    <dependency>
+      <groupId>com.github.javaparser</groupId>
+      <artifactId>javaparser-core</artifactId>
+      <version>3.0.0-alpha.4</version>
+    </dependency>
+    <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+      <version>1.3.2</version>
+      <scope>compile</scope>
+    </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-databind</artifactId>
+      <version>2.9.10.1</version>
+    </dependency>
+    <dependency>
+      <groupId>args4j</groupId>
+      <artifactId>args4j</artifactId>
+      <version>2.33</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-lang3</artifactId>
+      <version>3.5</version>
+    </dependency>
+  </dependencies>
+  <properties>
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+  </properties>
+</project>
+