diff --git a/.gitignore b/.gitignore index cb67ab2..2e90d6f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,12 +1,12 @@ # Models -models/* +**/models/* # Datasets -java_files/* -text_arff/* +**/java_files/* +**/text_arff/* # Weka files -weka_files/* +**/weka_files/* # Keep readme files !**/README.md diff --git a/JavaExtractor/JPredict/target/JavaExtractor-0.0.1-SNAPSHOT.jar b/JavaExtractor/JPredict/target/JavaExtractor-0.0.1-SNAPSHOT.jar deleted file mode 100644 index 2b6ac72..0000000 Binary files a/JavaExtractor/JPredict/target/JavaExtractor-0.0.1-SNAPSHOT.jar and /dev/null differ diff --git a/JavaExtractor/JPredict/target/classes/JavaExtractor/App.class b/JavaExtractor/JPredict/target/classes/JavaExtractor/App.class deleted file mode 100644 index b75a6b8..0000000 Binary files a/JavaExtractor/JPredict/target/classes/JavaExtractor/App.class and /dev/null differ diff --git a/JavaExtractor/JPredict/target/classes/JavaExtractor/Common/CommandLineValues.class b/JavaExtractor/JPredict/target/classes/JavaExtractor/Common/CommandLineValues.class deleted file mode 100644 index ef75349..0000000 Binary files a/JavaExtractor/JPredict/target/classes/JavaExtractor/Common/CommandLineValues.class and /dev/null differ diff --git a/JavaExtractor/JPredict/target/classes/JavaExtractor/Common/Common$1.class b/JavaExtractor/JPredict/target/classes/JavaExtractor/Common/Common$1.class deleted file mode 100644 index d96d9df..0000000 Binary files a/JavaExtractor/JPredict/target/classes/JavaExtractor/Common/Common$1.class and /dev/null differ diff --git a/JavaExtractor/JPredict/target/classes/JavaExtractor/Common/Common$2.class b/JavaExtractor/JPredict/target/classes/JavaExtractor/Common/Common$2.class deleted file mode 100644 index 7c0940c..0000000 Binary files a/JavaExtractor/JPredict/target/classes/JavaExtractor/Common/Common$2.class and /dev/null differ diff --git a/JavaExtractor/JPredict/target/classes/JavaExtractor/Common/Common$3.class b/JavaExtractor/JPredict/target/classes/JavaExtractor/Common/Common$3.class deleted file mode 100644 index 7cddbd9..0000000 Binary files a/JavaExtractor/JPredict/target/classes/JavaExtractor/Common/Common$3.class and /dev/null differ diff --git a/JavaExtractor/JPredict/target/classes/JavaExtractor/Common/Common.class b/JavaExtractor/JPredict/target/classes/JavaExtractor/Common/Common.class deleted file mode 100644 index c61e9b6..0000000 Binary files a/JavaExtractor/JPredict/target/classes/JavaExtractor/Common/Common.class and /dev/null differ diff --git a/JavaExtractor/JPredict/target/classes/JavaExtractor/Common/MethodContent.class b/JavaExtractor/JPredict/target/classes/JavaExtractor/Common/MethodContent.class deleted file mode 100644 index 8c5a874..0000000 Binary files a/JavaExtractor/JPredict/target/classes/JavaExtractor/Common/MethodContent.class and /dev/null differ diff --git a/JavaExtractor/JPredict/target/classes/JavaExtractor/ExtractFeaturesTask.class b/JavaExtractor/JPredict/target/classes/JavaExtractor/ExtractFeaturesTask.class deleted file mode 100644 index 3468735..0000000 Binary files a/JavaExtractor/JPredict/target/classes/JavaExtractor/ExtractFeaturesTask.class and /dev/null differ diff --git a/JavaExtractor/JPredict/target/classes/JavaExtractor/FeatureExtractor.class b/JavaExtractor/JPredict/target/classes/JavaExtractor/FeatureExtractor.class deleted file mode 100644 index 7e6dbe6..0000000 Binary files a/JavaExtractor/JPredict/target/classes/JavaExtractor/FeatureExtractor.class and /dev/null differ diff --git a/JavaExtractor/JPredict/target/classes/JavaExtractor/FeaturesEntities/ProgramFeatures.class b/JavaExtractor/JPredict/target/classes/JavaExtractor/FeaturesEntities/ProgramFeatures.class deleted file mode 100644 index 026ea7a..0000000 Binary files a/JavaExtractor/JPredict/target/classes/JavaExtractor/FeaturesEntities/ProgramFeatures.class and /dev/null differ diff --git a/JavaExtractor/JPredict/target/classes/JavaExtractor/FeaturesEntities/ProgramNode.class b/JavaExtractor/JPredict/target/classes/JavaExtractor/FeaturesEntities/ProgramNode.class deleted file mode 100644 index 54e83de..0000000 Binary files a/JavaExtractor/JPredict/target/classes/JavaExtractor/FeaturesEntities/ProgramNode.class and /dev/null differ diff --git a/JavaExtractor/JPredict/target/classes/JavaExtractor/FeaturesEntities/ProgramRelation.class b/JavaExtractor/JPredict/target/classes/JavaExtractor/FeaturesEntities/ProgramRelation.class deleted file mode 100644 index 2eb40ba..0000000 Binary files a/JavaExtractor/JPredict/target/classes/JavaExtractor/FeaturesEntities/ProgramRelation.class and /dev/null differ diff --git a/JavaExtractor/JPredict/target/classes/JavaExtractor/FeaturesEntities/Property.class b/JavaExtractor/JPredict/target/classes/JavaExtractor/FeaturesEntities/Property.class deleted file mode 100644 index e7a1b6a..0000000 Binary files a/JavaExtractor/JPredict/target/classes/JavaExtractor/FeaturesEntities/Property.class and /dev/null differ diff --git a/JavaExtractor/JPredict/target/classes/JavaExtractor/Visitors/FunctionVisitor.class b/JavaExtractor/JPredict/target/classes/JavaExtractor/Visitors/FunctionVisitor.class deleted file mode 100644 index 70c6d53..0000000 Binary files a/JavaExtractor/JPredict/target/classes/JavaExtractor/Visitors/FunctionVisitor.class and /dev/null differ diff --git a/JavaExtractor/JPredict/target/classes/JavaExtractor/Visitors/LeavesCollectorVisitor.class b/JavaExtractor/JPredict/target/classes/JavaExtractor/Visitors/LeavesCollectorVisitor.class deleted file mode 100644 index ec51674..0000000 Binary files a/JavaExtractor/JPredict/target/classes/JavaExtractor/Visitors/LeavesCollectorVisitor.class and /dev/null differ diff --git a/README.md b/README.md index a7a2e56..7c46715 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Obsucated code2vec: Improving Generalisation by Hiding Information +# Obsucated code2vec: Reducing Model Bias by Hiding Information ![Overall project view](img/overall.png) @@ -11,18 +11,37 @@ All of the model-related code (`common.py`, `model.py`, `PathContextReader.py`) All models/datasets are on the paper google drive folder https://drive.google.com/drive/u/1/folders/1CXgSXKf292BTlryASui2kBvYvJSvFnWN +## Requirements +- Java 8+ +- Python 3 + +## Usage - Obfuscator +These steps should all be run from within the `java-obfuscator/` directory. +1. Locate a folder of `.java` files (e.g., from the [code2seq](https://github.com/tech-srl/code2seq) repository) +2. Alter the input and output directories in `obfs-script.sh`, as well as the number of threads of your machine. If you're running this on a particularly large folder (e.g., millions of files) then you may need to increase the `NUM_PARTITIONS` to 3 or 4, otherwise memory issues can occur, grinding the obfuscator to a near halt. +3. Run `obfs-script.sh` i.e. `$ source obfs-script.sh` + +This will result in a new obfuscated folder of `.java` files, that can be used to train a new obfuscated code2vec model (or any model that performs learning from source code for that matter). + ## Usage - Dataset Pipeline ![Dataset Pipeline View](img/pipeline.png) +These steps will convert a dataset of `.java` files into a numerical form (`.arff` by default), that can then be used with any standard WEKA classifier. + +These steps should all be run from within the `pipeline/` directory of this repository. To run the dataset pipeline and create class-level embeddings for a dataset of Java files: +1. `cd pipeline` +2. `pip install -r requirements.txt` 1. Download a `.java` dataset (from the datasets supplied or your own) and put in the `java_files/` directory 2. Download a code2vec model checkpoint and put the checkpoint folder in the `models/` directory -3. Change the paths and definitions in `model_defs.py` and number of models in `create_datasets.sh` to match your setup -4. Run `create_datasets.sh`. This will loop through each model and create class-level embeddings for the supplied datasets. The resulting datasets will be in `.arff` format in the `weka_files/` folder +3. Change the paths and definitions in `model_defs.py` and number of models in `scripts/create_datasets.sh` to match your setup +4. Run `create_datasets.sh` (`source scripts/create_datasets.sh`). This will loop through each model and create class-level embeddings for the supplied datasets. The resulting datasets will be in `.arff` format in the `weka_files/` folder. + +You can now perform class-level classification on the dataset using any off-the-shelf classifier. ### Config -By default the pipeline will use the full range of values for each parameter, which creates a huge number of resulting `.arff` datasets (>1000). To reduce the number of these, remove (or comment out) some of the items in the arrays in `reduction_methods.py` and `selection_methods.py` (at the end of the file). Our experiments showed that the `SelectAll` selection method and `NoReduction` reduction method performed best in most cases so you may want to keep only these. +By default the pipeline will use the full range of values for each parameter, which creates a huge number of resulting `.arff` datasets (>1000). To reduce the number of these, remove (or comment out) some of the items in the arrays in `reduction_methods.py` and `selection_methods.py` (at the end of the file). Our experiments showed that the `SelectAll` selection method and `NoReduction` reduction method performed best in most cases so you may want to just keep these. ## Datasets @@ -52,6 +71,8 @@ The `.java` files are all [available for download](https://drive.google.com/driv 13 categories, 1062 instances +This dataset was collected using the [github-scraper](https://github.com/basedrhys/github-scraper) python tool, which makes it easy to download specific types of files from github repos (`.java` files in this case). + [Google Drive Link](https://drive.google.com/open?id=1IC0Nxeew73p9yvfhKcKH-6mxW8nHGyfn) [Embedding Visualisation](http://projector.tensorflow.org/?config=https://gist.githubusercontent.com/basedrhys/36fcd8653f2d759a8f1b03e56502a58e/raw/7d2ddef1c219d4fad7a49cc2c978d1ff4e25e5f1/author_config.json) diff --git a/java-tool.jar b/java-tool.jar deleted file mode 100644 index 0d58838..0000000 Binary files a/java-tool.jar and /dev/null differ diff --git a/ClassPreprocessor.py b/pipeline/ClassPreprocessor.py similarity index 100% rename from ClassPreprocessor.py rename to pipeline/ClassPreprocessor.py diff --git a/JavaExtractor/JPredict/.classpath b/pipeline/JavaExtractor/JPredict/.classpath old mode 100755 new mode 100644 similarity index 100% rename from JavaExtractor/JPredict/.classpath rename to pipeline/JavaExtractor/JPredict/.classpath diff --git a/JavaExtractor/JPredict/.gitignore b/pipeline/JavaExtractor/JPredict/.gitignore old mode 100755 new mode 100644 similarity index 100% rename from JavaExtractor/JPredict/.gitignore rename to pipeline/JavaExtractor/JPredict/.gitignore diff --git a/JavaExtractor/JPredict/.project b/pipeline/JavaExtractor/JPredict/.project old mode 100755 new mode 100644 similarity index 95% rename from JavaExtractor/JPredict/.project rename to pipeline/JavaExtractor/JPredict/.project index fee6c60..9ca9d79 --- a/JavaExtractor/JPredict/.project +++ b/pipeline/JavaExtractor/JPredict/.project @@ -1,23 +1,23 @@ - - - JavaExtractor - - - - - - org.eclipse.jdt.core.javabuilder - - - - - org.eclipse.m2e.core.maven2Builder - - - - - - org.eclipse.jdt.core.javanature - org.eclipse.m2e.core.maven2Nature - - + + + JavaExtractor + + + + + + org.eclipse.jdt.core.javabuilder + + + + + org.eclipse.m2e.core.maven2Builder + + + + + + org.eclipse.jdt.core.javanature + org.eclipse.m2e.core.maven2Nature + + diff --git a/JavaExtractor/JPredict/.settings/org.eclipse.core.resources.prefs b/pipeline/JavaExtractor/JPredict/.settings/org.eclipse.core.resources.prefs old mode 100755 new mode 100644 similarity index 96% rename from JavaExtractor/JPredict/.settings/org.eclipse.core.resources.prefs rename to pipeline/JavaExtractor/JPredict/.settings/org.eclipse.core.resources.prefs index 654c175..e9441bb --- a/JavaExtractor/JPredict/.settings/org.eclipse.core.resources.prefs +++ b/pipeline/JavaExtractor/JPredict/.settings/org.eclipse.core.resources.prefs @@ -1,3 +1,3 @@ -eclipse.preferences.version=1 -encoding//src/main/java=UTF-8 -encoding/=UTF-8 +eclipse.preferences.version=1 +encoding//src/main/java=UTF-8 +encoding/=UTF-8 diff --git a/JavaExtractor/JPredict/.settings/org.eclipse.jdt.apt.core.prefs b/pipeline/JavaExtractor/JPredict/.settings/org.eclipse.jdt.apt.core.prefs similarity index 100% rename from JavaExtractor/JPredict/.settings/org.eclipse.jdt.apt.core.prefs rename to pipeline/JavaExtractor/JPredict/.settings/org.eclipse.jdt.apt.core.prefs diff --git a/JavaExtractor/JPredict/.settings/org.eclipse.jdt.core.prefs b/pipeline/JavaExtractor/JPredict/.settings/org.eclipse.jdt.core.prefs old mode 100755 new mode 100644 similarity index 98% rename from JavaExtractor/JPredict/.settings/org.eclipse.jdt.core.prefs rename to pipeline/JavaExtractor/JPredict/.settings/org.eclipse.jdt.core.prefs index c79b505..ea7a397 --- a/JavaExtractor/JPredict/.settings/org.eclipse.jdt.core.prefs +++ b/pipeline/JavaExtractor/JPredict/.settings/org.eclipse.jdt.core.prefs @@ -1,16 +1,16 @@ -eclipse.preferences.version=1 -org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled -org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 -org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve -org.eclipse.jdt.core.compiler.compliance=1.8 -org.eclipse.jdt.core.compiler.debug.lineNumber=generate -org.eclipse.jdt.core.compiler.debug.localVariable=generate -org.eclipse.jdt.core.compiler.debug.sourceFile=generate -org.eclipse.jdt.core.compiler.problem.assertIdentifier=error -org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled -org.eclipse.jdt.core.compiler.problem.enumIdentifier=error -org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning -org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore -org.eclipse.jdt.core.compiler.processAnnotations=disabled -org.eclipse.jdt.core.compiler.release=disabled -org.eclipse.jdt.core.compiler.source=1.8 +eclipse.preferences.version=1 +org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled +org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 +org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve +org.eclipse.jdt.core.compiler.compliance=1.8 +org.eclipse.jdt.core.compiler.debug.lineNumber=generate +org.eclipse.jdt.core.compiler.debug.localVariable=generate +org.eclipse.jdt.core.compiler.debug.sourceFile=generate +org.eclipse.jdt.core.compiler.problem.assertIdentifier=error +org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled +org.eclipse.jdt.core.compiler.problem.enumIdentifier=error +org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning +org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore +org.eclipse.jdt.core.compiler.processAnnotations=disabled +org.eclipse.jdt.core.compiler.release=disabled +org.eclipse.jdt.core.compiler.source=1.8 diff --git a/JavaExtractor/JPredict/.settings/org.eclipse.m2e.core.prefs b/pipeline/JavaExtractor/JPredict/.settings/org.eclipse.m2e.core.prefs old mode 100755 new mode 100644 similarity index 95% rename from JavaExtractor/JPredict/.settings/org.eclipse.m2e.core.prefs rename to pipeline/JavaExtractor/JPredict/.settings/org.eclipse.m2e.core.prefs index 14b697b..f897a7f --- a/JavaExtractor/JPredict/.settings/org.eclipse.m2e.core.prefs +++ b/pipeline/JavaExtractor/JPredict/.settings/org.eclipse.m2e.core.prefs @@ -1,4 +1,4 @@ -activeProfiles= -eclipse.preferences.version=1 -resolveWorkspaceProjects=true -version=1 +activeProfiles= +eclipse.preferences.version=1 +resolveWorkspaceProjects=true +version=1 diff --git a/JavaExtractor/JPredict/pom.xml b/pipeline/JavaExtractor/JPredict/pom.xml similarity index 96% rename from JavaExtractor/JPredict/pom.xml rename to pipeline/JavaExtractor/JPredict/pom.xml index 6e3b71d..91af533 100644 --- a/JavaExtractor/JPredict/pom.xml +++ b/pipeline/JavaExtractor/JPredict/pom.xml @@ -1,75 +1,75 @@ - - - 4.0.0 - JavaExtractor - JavaExtractor - JPredict - 0.0.1-SNAPSHOT - http://maven.apache.org - - - - maven-compiler-plugin - 3.2 - - 1.8 - 1.8 - - Test.java - - - - - maven-shade-plugin - 2.1 - - - package - - shade - - - - - - - - - - - - - - - com.github.javaparser - javaparser-core - 3.0.0-alpha.4 - - - commons-io - commons-io - 1.3.2 - compile - - - com.fasterxml.jackson.core - jackson-databind - 2.9.10.1 - - - args4j - args4j - 2.33 - - - org.apache.commons - commons-lang3 - 3.5 - - - - UTF-8 - - - + + + 4.0.0 + JavaExtractor + JavaExtractor + JPredict + 0.0.1-SNAPSHOT + http://maven.apache.org + + + + maven-compiler-plugin + 3.2 + + 1.8 + 1.8 + + Test.java + + + + + maven-shade-plugin + 2.1 + + + package + + shade + + + + + + + + + + + + + + + com.github.javaparser + javaparser-core + 3.0.0-alpha.4 + + + commons-io + commons-io + 1.3.2 + compile + + + com.fasterxml.jackson.core + jackson-databind + 2.9.10.1 + + + args4j + args4j + 2.33 + + + org.apache.commons + commons-lang3 + 3.5 + + + + UTF-8 + + + diff --git a/JavaExtractor/JPredict/src/main/java/JavaExtractor/App.java b/pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/App.java old mode 100755 new mode 100644 similarity index 96% rename from JavaExtractor/JPredict/src/main/java/JavaExtractor/App.java rename to pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/App.java index 778680c..e96b75c --- a/JavaExtractor/JPredict/src/main/java/JavaExtractor/App.java +++ b/pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/App.java @@ -1,60 +1,60 @@ -package JavaExtractor; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Paths; -import java.util.LinkedList; -import java.util.concurrent.Executors; -import java.util.concurrent.ThreadPoolExecutor; - -import org.kohsuke.args4j.CmdLineException; - -import JavaExtractor.Common.CommandLineValues; -import JavaExtractor.FeaturesEntities.ProgramRelation; - -public class App { - private static CommandLineValues s_CommandLineValues; - - public static void main(String[] args) { - try { - s_CommandLineValues = new CommandLineValues(args); - } catch (CmdLineException e) { - e.printStackTrace(); - return; - } - - if (s_CommandLineValues.NoHash) { - ProgramRelation.setNoHash(); - } - - if (s_CommandLineValues.File != null) { - ExtractFeaturesTask extractFeaturesTask = new ExtractFeaturesTask(s_CommandLineValues, - s_CommandLineValues.File.toPath()); - extractFeaturesTask.processFile(); - } else if (s_CommandLineValues.Dir != null) { - extractDir(); - } - } - - private static void extractDir() { - ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(s_CommandLineValues.NumThreads); - LinkedList tasks = new LinkedList<>(); - try { - Files.walk(Paths.get(s_CommandLineValues.Dir)).filter(Files::isRegularFile) - .filter(p -> p.toString().toLowerCase().endsWith(".java")).forEach(f -> { - ExtractFeaturesTask task = new ExtractFeaturesTask(s_CommandLineValues, f); - tasks.add(task); - }); - } catch (IOException e) { - e.printStackTrace(); - return; - } - try { - executor.invokeAll(tasks); - } catch (InterruptedException e) { - e.printStackTrace(); - } finally { - executor.shutdown(); - } - } -} +package JavaExtractor; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.LinkedList; +import java.util.concurrent.Executors; +import java.util.concurrent.ThreadPoolExecutor; + +import org.kohsuke.args4j.CmdLineException; + +import JavaExtractor.Common.CommandLineValues; +import JavaExtractor.FeaturesEntities.ProgramRelation; + +public class App { + private static CommandLineValues s_CommandLineValues; + + public static void main(String[] args) { + try { + s_CommandLineValues = new CommandLineValues(args); + } catch (CmdLineException e) { + e.printStackTrace(); + return; + } + + if (s_CommandLineValues.NoHash) { + ProgramRelation.setNoHash(); + } + + if (s_CommandLineValues.File != null) { + ExtractFeaturesTask extractFeaturesTask = new ExtractFeaturesTask(s_CommandLineValues, + s_CommandLineValues.File.toPath()); + extractFeaturesTask.processFile(); + } else if (s_CommandLineValues.Dir != null) { + extractDir(); + } + } + + private static void extractDir() { + ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(s_CommandLineValues.NumThreads); + LinkedList tasks = new LinkedList<>(); + try { + Files.walk(Paths.get(s_CommandLineValues.Dir)).filter(Files::isRegularFile) + .filter(p -> p.toString().toLowerCase().endsWith(".java")).forEach(f -> { + ExtractFeaturesTask task = new ExtractFeaturesTask(s_CommandLineValues, f); + tasks.add(task); + }); + } catch (IOException e) { + e.printStackTrace(); + return; + } + try { + executor.invokeAll(tasks); + } catch (InterruptedException e) { + e.printStackTrace(); + } finally { + executor.shutdown(); + } + } +} diff --git a/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/CommandLineValues.java b/pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/CommandLineValues.java old mode 100755 new mode 100644 similarity index 96% rename from JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/CommandLineValues.java rename to pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/CommandLineValues.java index 3cd3ff5..c5b3cdb --- a/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/CommandLineValues.java +++ b/pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/CommandLineValues.java @@ -1,56 +1,56 @@ -package JavaExtractor.Common; - -import java.io.File; -import org.kohsuke.args4j.CmdLineException; -import org.kohsuke.args4j.CmdLineParser; -import org.kohsuke.args4j.Option; - -/** - * This class handles the programs arguments. - */ -public class CommandLineValues { - @Option(name = "--file", required = false) - public File File = null; - - @Option(name = "--dir", required = false, forbids = "--file") - public String Dir = null; - - @Option(name = "--max_path_length", required = true) - public int MaxPathLength; - - @Option(name = "--max_path_width", required = true) - public int MaxPathWidth; - - @Option(name = "--no_hash", required = false) - public boolean NoHash = false; - - @Option(name = "--num_threads", required = false) - public int NumThreads = 32; - - @Option(name = "--min_code_len", required = false) - public int MinCodeLength = 1; - - @Option(name = "--max_code_len", required = false) - public int MaxCodeLength = 10000; - - @Option(name = "--pretty_print", required = false) - public boolean PrettyPrint = false; - - @Option(name = "--max_child_id", required = false) - public int MaxChildId = Integer.MAX_VALUE; - - public CommandLineValues(String... args) throws CmdLineException { - CmdLineParser parser = new CmdLineParser(this); - try { - parser.parseArgument(args); - } catch (CmdLineException e) { - System.err.println(e.getMessage()); - parser.printUsage(System.err); - throw e; - } - } - - public CommandLineValues() { - - } +package JavaExtractor.Common; + +import java.io.File; +import org.kohsuke.args4j.CmdLineException; +import org.kohsuke.args4j.CmdLineParser; +import org.kohsuke.args4j.Option; + +/** + * This class handles the programs arguments. + */ +public class CommandLineValues { + @Option(name = "--file", required = false) + public File File = null; + + @Option(name = "--dir", required = false, forbids = "--file") + public String Dir = null; + + @Option(name = "--max_path_length", required = true) + public int MaxPathLength; + + @Option(name = "--max_path_width", required = true) + public int MaxPathWidth; + + @Option(name = "--no_hash", required = false) + public boolean NoHash = false; + + @Option(name = "--num_threads", required = false) + public int NumThreads = 32; + + @Option(name = "--min_code_len", required = false) + public int MinCodeLength = 1; + + @Option(name = "--max_code_len", required = false) + public int MaxCodeLength = 10000; + + @Option(name = "--pretty_print", required = false) + public boolean PrettyPrint = false; + + @Option(name = "--max_child_id", required = false) + public int MaxChildId = Integer.MAX_VALUE; + + public CommandLineValues(String... args) throws CmdLineException { + CmdLineParser parser = new CmdLineParser(this); + try { + parser.parseArgument(args); + } catch (CmdLineException e) { + System.err.println(e.getMessage()); + parser.printUsage(System.err); + throw e; + } + } + + public CommandLineValues() { + + } } \ No newline at end of file diff --git a/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java b/pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java old mode 100755 new mode 100644 similarity index 97% rename from JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java rename to pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java index 7b2e3d6..8f5acfd --- a/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java +++ b/pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java @@ -1,77 +1,77 @@ -package JavaExtractor.Common; - -import java.util.ArrayList; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -import com.github.javaparser.ast.Node; -import com.github.javaparser.ast.UserDataKey; - -import JavaExtractor.FeaturesEntities.ProgramNode; -import JavaExtractor.FeaturesEntities.Property; - -public final class Common { - public static final UserDataKey PropertyKey = new UserDataKey() { - }; - public static final UserDataKey ProgramNodeKey = new UserDataKey() { - }; - public static final UserDataKey ChildId = new UserDataKey() { - }; - public static final String EmptyString = ""; - public static final String UTF8 = "UTF-8"; - public static final String EvaluateTempDir = "EvalTemp"; - - public static final String FieldAccessExpr = "FieldAccessExpr"; - public static final String ClassOrInterfaceType = "ClassOrInterfaceType"; - public static final String MethodDeclaration = "MethodDeclaration"; - public static final String NameExpr = "NameExpr"; - public static final String MethodCallExpr = "MethodCallExpr"; - public static final String DummyNode = "DummyNode"; - public static final String BlankWord = "BLANK"; - - public static final int c_MaxLabelLength = 50; - public static final String methodName = "METHOD_NAME"; - public static final String internalSeparator = "|"; - - public static String normalizeName(String original, String defaultString) { - original = original.toLowerCase().replaceAll("\\\\n", "") // escaped new - // lines - .replaceAll("//s+", "") // whitespaces - .replaceAll("[\"',]", "") // quotes, apostrophies, commas - .replaceAll("\\P{Print}", ""); // unicode weird characters - String stripped = original.replaceAll("[^A-Za-z]", ""); - if (stripped.length() == 0) { - String carefulStripped = original.replaceAll(" ", "_"); - if (carefulStripped.length() == 0) { - return defaultString; - } else { - return carefulStripped; - } - } else { - return stripped; - } - } - - public static boolean isMethod(Node node) { - String type = node.getUserData(Common.PropertyKey).getType(); - - return isMethod(node, type); - } - - public static boolean isMethod(Node node, String type) { - Property parentProperty = node.getParentNode().getUserData(Common.PropertyKey); - if (parentProperty == null) { - return false; - } - - String parentType = parentProperty.getType(); - return Common.NameExpr.equals(type) && Common.MethodDeclaration.equals(parentType); - } - - public static ArrayList splitToSubtokens(String str1) { - String str2 = str1.trim(); - return Stream.of(str2.split("(?<=[a-z])(?=[A-Z])|_|[0-9]|(?<=[A-Z])(?=[A-Z][a-z])|\\s+")) - .filter(s -> s.length() > 0).map(s -> Common.normalizeName(s, Common.EmptyString)) - .filter(s -> s.length() > 0).collect(Collectors.toCollection(ArrayList::new)); - } -} +package JavaExtractor.Common; + +import java.util.ArrayList; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import com.github.javaparser.ast.Node; +import com.github.javaparser.ast.UserDataKey; + +import JavaExtractor.FeaturesEntities.ProgramNode; +import JavaExtractor.FeaturesEntities.Property; + +public final class Common { + public static final UserDataKey PropertyKey = new UserDataKey() { + }; + public static final UserDataKey ProgramNodeKey = new UserDataKey() { + }; + public static final UserDataKey ChildId = new UserDataKey() { + }; + public static final String EmptyString = ""; + public static final String UTF8 = "UTF-8"; + public static final String EvaluateTempDir = "EvalTemp"; + + public static final String FieldAccessExpr = "FieldAccessExpr"; + public static final String ClassOrInterfaceType = "ClassOrInterfaceType"; + public static final String MethodDeclaration = "MethodDeclaration"; + public static final String NameExpr = "NameExpr"; + public static final String MethodCallExpr = "MethodCallExpr"; + public static final String DummyNode = "DummyNode"; + public static final String BlankWord = "BLANK"; + + public static final int c_MaxLabelLength = 50; + public static final String methodName = "METHOD_NAME"; + public static final String internalSeparator = "|"; + + public static String normalizeName(String original, String defaultString) { + original = original.toLowerCase().replaceAll("\\\\n", "") // escaped new + // lines + .replaceAll("//s+", "") // whitespaces + .replaceAll("[\"',]", "") // quotes, apostrophies, commas + .replaceAll("\\P{Print}", ""); // unicode weird characters + String stripped = original.replaceAll("[^A-Za-z]", ""); + if (stripped.length() == 0) { + String carefulStripped = original.replaceAll(" ", "_"); + if (carefulStripped.length() == 0) { + return defaultString; + } else { + return carefulStripped; + } + } else { + return stripped; + } + } + + public static boolean isMethod(Node node) { + String type = node.getUserData(Common.PropertyKey).getType(); + + return isMethod(node, type); + } + + public static boolean isMethod(Node node, String type) { + Property parentProperty = node.getParentNode().getUserData(Common.PropertyKey); + if (parentProperty == null) { + return false; + } + + String parentType = parentProperty.getType(); + return Common.NameExpr.equals(type) && Common.MethodDeclaration.equals(parentType); + } + + public static ArrayList splitToSubtokens(String str1) { + String str2 = str1.trim(); + return Stream.of(str2.split("(?<=[a-z])(?=[A-Z])|_|[0-9]|(?<=[A-Z])(?=[A-Z][a-z])|\\s+")) + .filter(s -> s.length() > 0).map(s -> Common.normalizeName(s, Common.EmptyString)) + .filter(s -> s.length() > 0).collect(Collectors.toCollection(ArrayList::new)); + } +} diff --git a/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/MethodContent.java b/pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/MethodContent.java old mode 100755 new mode 100644 similarity index 94% rename from JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/MethodContent.java rename to pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/MethodContent.java index 4df3cfd..81b5f89 --- a/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/MethodContent.java +++ b/pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/MethodContent.java @@ -1,29 +1,29 @@ -package JavaExtractor.Common; - -import java.util.ArrayList; -import com.github.javaparser.ast.Node; - -public class MethodContent { - private ArrayList leaves; - private String name; - private long length; - - public MethodContent(ArrayList leaves, String name, long length) { - this.leaves = leaves; - this.name = name; - this.length = length; - } - - public ArrayList getLeaves() { - return leaves; - } - - public String getName() { - return name; - } - - public long getLength() { - return length; - } - -} +package JavaExtractor.Common; + +import java.util.ArrayList; +import com.github.javaparser.ast.Node; + +public class MethodContent { + private ArrayList leaves; + private String name; + private long length; + + public MethodContent(ArrayList leaves, String name, long length) { + this.leaves = leaves; + this.name = name; + this.length = length; + } + + public ArrayList getLeaves() { + return leaves; + } + + public String getName() { + return name; + } + + public long getLength() { + return length; + } + +} diff --git a/JavaExtractor/JPredict/src/main/java/JavaExtractor/ExtractFeaturesTask.java b/pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/ExtractFeaturesTask.java old mode 100755 new mode 100644 similarity index 96% rename from JavaExtractor/JPredict/src/main/java/JavaExtractor/ExtractFeaturesTask.java rename to pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/ExtractFeaturesTask.java index 3eaf2a7..41512c8 --- a/JavaExtractor/JPredict/src/main/java/JavaExtractor/ExtractFeaturesTask.java +++ b/pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/ExtractFeaturesTask.java @@ -1,92 +1,92 @@ -package JavaExtractor; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.ArrayList; -import java.util.List; -import java.util.concurrent.Callable; - -import org.apache.commons.lang3.StringUtils; - -import com.github.javaparser.ParseException; -import com.github.javaparser.ast.CompilationUnit; - -import JavaExtractor.Common.CommandLineValues; -import JavaExtractor.Common.Common; -import JavaExtractor.FeaturesEntities.ProgramFeatures; - -public class ExtractFeaturesTask implements Callable { - CommandLineValues m_CommandLineValues; - Path filePath; - - public ExtractFeaturesTask(CommandLineValues commandLineValues, Path path) { - m_CommandLineValues = commandLineValues; - this.filePath = path; - } - - @Override - public Void call() throws Exception { - //System.err.println("Extracting file: " + filePath); - processFile(); - //System.err.println("Done with file: " + filePath); - return null; - } - - public void processFile() { - ArrayList features; - try { - features = extractSingleFile(); - } catch (ParseException | IOException e) { - e.printStackTrace(); - return; - } - if (features == null) { - return; - } - - String toPrint = featuresToString(features); - if (toPrint.length() > 0) { - System.out.println(toPrint); - } - } - - public ArrayList extractSingleFile() throws ParseException, IOException { - String code = null; - try { - code = new String(Files.readAllBytes(this.filePath)); - } catch (IOException e) { - e.printStackTrace(); - code = Common.EmptyString; - } - FeatureExtractor featureExtractor = new FeatureExtractor(m_CommandLineValues); - - ArrayList features = featureExtractor.extractFeatures(code); - - return features; - } - - public String featuresToString(ArrayList features) { - if (features == null || features.isEmpty()) { - return Common.EmptyString; - } - - List methodsOutputs = new ArrayList<>(); - - for (ProgramFeatures singleMethodfeatures : features) { - StringBuilder builder = new StringBuilder(); - - String toPrint = Common.EmptyString; - toPrint = singleMethodfeatures.toString(); - if (m_CommandLineValues.PrettyPrint) { - toPrint = toPrint.replace(" ", "\n\t"); - } - builder.append(toPrint); - - - methodsOutputs.add(builder.toString()); - - } - return StringUtils.join(methodsOutputs, "\n"); - } -} +package JavaExtractor; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.Callable; + +import org.apache.commons.lang3.StringUtils; + +import com.github.javaparser.ParseException; +import com.github.javaparser.ast.CompilationUnit; + +import JavaExtractor.Common.CommandLineValues; +import JavaExtractor.Common.Common; +import JavaExtractor.FeaturesEntities.ProgramFeatures; + +public class ExtractFeaturesTask implements Callable { + CommandLineValues m_CommandLineValues; + Path filePath; + + public ExtractFeaturesTask(CommandLineValues commandLineValues, Path path) { + m_CommandLineValues = commandLineValues; + this.filePath = path; + } + + @Override + public Void call() throws Exception { + //System.err.println("Extracting file: " + filePath); + processFile(); + //System.err.println("Done with file: " + filePath); + return null; + } + + public void processFile() { + ArrayList features; + try { + features = extractSingleFile(); + } catch (ParseException | IOException e) { + e.printStackTrace(); + return; + } + if (features == null) { + return; + } + + String toPrint = featuresToString(features); + if (toPrint.length() > 0) { + System.out.println(toPrint); + } + } + + public ArrayList extractSingleFile() throws ParseException, IOException { + String code = null; + try { + code = new String(Files.readAllBytes(this.filePath)); + } catch (IOException e) { + e.printStackTrace(); + code = Common.EmptyString; + } + FeatureExtractor featureExtractor = new FeatureExtractor(m_CommandLineValues); + + ArrayList features = featureExtractor.extractFeatures(code); + + return features; + } + + public String featuresToString(ArrayList features) { + if (features == null || features.isEmpty()) { + return Common.EmptyString; + } + + List methodsOutputs = new ArrayList<>(); + + for (ProgramFeatures singleMethodfeatures : features) { + StringBuilder builder = new StringBuilder(); + + String toPrint = Common.EmptyString; + toPrint = singleMethodfeatures.toString(); + if (m_CommandLineValues.PrettyPrint) { + toPrint = toPrint.replace(" ", "\n\t"); + } + builder.append(toPrint); + + + methodsOutputs.add(builder.toString()); + + } + return StringUtils.join(methodsOutputs, "\n"); + } +} diff --git a/JavaExtractor/JPredict/src/main/java/JavaExtractor/FeatureExtractor.java b/pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/FeatureExtractor.java old mode 100755 new mode 100644 similarity index 97% rename from JavaExtractor/JPredict/src/main/java/JavaExtractor/FeatureExtractor.java rename to pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/FeatureExtractor.java index 1a0e1d5..626bda2 --- a/JavaExtractor/JPredict/src/main/java/JavaExtractor/FeatureExtractor.java +++ b/pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/FeatureExtractor.java @@ -1,196 +1,196 @@ -package JavaExtractor; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.Set; -import java.util.StringJoiner; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -import com.github.javaparser.JavaParser; -import com.github.javaparser.ParseException; -import com.github.javaparser.ParseProblemException; -import com.github.javaparser.ast.CompilationUnit; -import com.github.javaparser.ast.Node; -import JavaExtractor.Common.CommandLineValues; -import JavaExtractor.Common.Common; -import JavaExtractor.Common.MethodContent; -import JavaExtractor.FeaturesEntities.ProgramFeatures; -import JavaExtractor.FeaturesEntities.Property; -import JavaExtractor.Visitors.FunctionVisitor; - -@SuppressWarnings("StringEquality") -public class FeatureExtractor { - private CommandLineValues m_CommandLineValues; - private static Set s_ParentTypeToAddChildId = Stream - .of("AssignExpr", "ArrayAccessExpr", "FieldAccessExpr", "MethodCallExpr") - .collect(Collectors.toCollection(HashSet::new)); - - final static String lparen = "("; - final static String rparen = ")"; - final static String upSymbol = "^"; - final static String downSymbol = "_"; - - public FeatureExtractor(CommandLineValues commandLineValues) { - this.m_CommandLineValues = commandLineValues; - } - - public ArrayList extractFeatures(String code) throws ParseException, IOException { - CompilationUnit compilationUnit = parseFileWithRetries(code); - FunctionVisitor functionVisitor = new FunctionVisitor(); - - functionVisitor.visit(compilationUnit, null); - - ArrayList methods = functionVisitor.getMethodContents(); - ArrayList programs = generatePathFeatures(methods); - - return programs; - } - - private CompilationUnit parseFileWithRetries(String code) throws IOException { - final String classPrefix = "public class Test {"; - final String classSuffix = "}"; - final String methodPrefix = "SomeUnknownReturnType f() {"; - final String methodSuffix = "return noSuchReturnValue; }"; - - String originalContent = code; - String content = originalContent; - CompilationUnit parsed = null; - try { - parsed = JavaParser.parse(content); - } catch (ParseProblemException e1) { - // Wrap with a class and method - try { - content = classPrefix + methodPrefix + originalContent + methodSuffix + classSuffix; - parsed = JavaParser.parse(content); - } catch (ParseProblemException e2) { - // Wrap with a class only - content = classPrefix + originalContent + classSuffix; - parsed = JavaParser.parse(content); - } - } - - return parsed; - } - - public ArrayList generatePathFeatures(ArrayList methods) { - ArrayList methodsFeatures = new ArrayList<>(); - for (MethodContent content : methods) { - if (content.getLength() < m_CommandLineValues.MinCodeLength - || content.getLength() > m_CommandLineValues.MaxCodeLength) - continue; - ProgramFeatures singleMethodFeatures = generatePathFeaturesForFunction(content); - if (!singleMethodFeatures.isEmpty()) { - methodsFeatures.add(singleMethodFeatures); - } - } - return methodsFeatures; - } - - private ProgramFeatures generatePathFeaturesForFunction(MethodContent methodContent) { - ArrayList functionLeaves = methodContent.getLeaves(); - ProgramFeatures programFeatures = new ProgramFeatures(methodContent.getName()); - - for (int i = 0; i < functionLeaves.size(); i++) { - for (int j = i + 1; j < functionLeaves.size(); j++) { - String separator = Common.EmptyString; - - String path = generatePath(functionLeaves.get(i), functionLeaves.get(j), separator); - if (path != Common.EmptyString) { - Property source = functionLeaves.get(i).getUserData(Common.PropertyKey); - Property target = functionLeaves.get(j).getUserData(Common.PropertyKey); - programFeatures.addFeature(source, path, target); - } - } - } - return programFeatures; - } - - private static ArrayList getTreeStack(Node node) { - ArrayList upStack = new ArrayList<>(); - Node current = node; - while (current != null) { - upStack.add(current); - current = current.getParentNode(); - } - return upStack; - } - - private String generatePath(Node source, Node target, String separator) { - String down = downSymbol; - String up = upSymbol; - String startSymbol = lparen; - String endSymbol = rparen; - - StringJoiner stringBuilder = new StringJoiner(separator); - ArrayList sourceStack = getTreeStack(source); - ArrayList targetStack = getTreeStack(target); - - int commonPrefix = 0; - int currentSourceAncestorIndex = sourceStack.size() - 1; - int currentTargetAncestorIndex = targetStack.size() - 1; - while (currentSourceAncestorIndex >= 0 && currentTargetAncestorIndex >= 0 - && sourceStack.get(currentSourceAncestorIndex) == targetStack.get(currentTargetAncestorIndex)) { - commonPrefix++; - currentSourceAncestorIndex--; - currentTargetAncestorIndex--; - } - - int pathLength = sourceStack.size() + targetStack.size() - 2 * commonPrefix; - if (pathLength > m_CommandLineValues.MaxPathLength) { - return Common.EmptyString; - } - - if (currentSourceAncestorIndex >= 0 && currentTargetAncestorIndex >= 0) { - int pathWidth = targetStack.get(currentTargetAncestorIndex).getUserData(Common.ChildId) - - sourceStack.get(currentSourceAncestorIndex).getUserData(Common.ChildId); - if (pathWidth > m_CommandLineValues.MaxPathWidth) { - return Common.EmptyString; - } - } - - for (int i = 0; i < sourceStack.size() - commonPrefix; i++) { - Node currentNode = sourceStack.get(i); - String childId = Common.EmptyString; - String parentRawType = currentNode.getParentNode().getUserData(Common.PropertyKey).getRawType(); - if (i == 0 || s_ParentTypeToAddChildId.contains(parentRawType)) { - childId = saturateChildId(currentNode.getUserData(Common.ChildId)) - .toString(); - } - stringBuilder.add(String.format("%s%s%s%s%s", startSymbol, - currentNode.getUserData(Common.PropertyKey).getType(), childId, endSymbol, up)); - } - - Node commonNode = sourceStack.get(sourceStack.size() - commonPrefix); - String commonNodeChildId = Common.EmptyString; - Property parentNodeProperty = commonNode.getParentNode().getUserData(Common.PropertyKey); - String commonNodeParentRawType = Common.EmptyString; - if (parentNodeProperty != null) { - commonNodeParentRawType = parentNodeProperty.getRawType(); - } - if (s_ParentTypeToAddChildId.contains(commonNodeParentRawType)) { - commonNodeChildId = saturateChildId(commonNode.getUserData(Common.ChildId)) - .toString(); - } - stringBuilder.add(String.format("%s%s%s%s", startSymbol, - commonNode.getUserData(Common.PropertyKey).getType(), commonNodeChildId, endSymbol)); - - for (int i = targetStack.size() - commonPrefix - 1; i >= 0; i--) { - Node currentNode = targetStack.get(i); - String childId = Common.EmptyString; - if (i == 0 || s_ParentTypeToAddChildId.contains(currentNode.getUserData(Common.PropertyKey).getRawType())) { - childId = saturateChildId(currentNode.getUserData(Common.ChildId)) - .toString(); - } - stringBuilder.add(String.format("%s%s%s%s%s", down, startSymbol, - currentNode.getUserData(Common.PropertyKey).getType(), childId, endSymbol)); - } - - return stringBuilder.toString(); - } - - private Integer saturateChildId(int childId) { - return Math.min(childId, m_CommandLineValues.MaxChildId); - } -} +package JavaExtractor; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Set; +import java.util.StringJoiner; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import com.github.javaparser.JavaParser; +import com.github.javaparser.ParseException; +import com.github.javaparser.ParseProblemException; +import com.github.javaparser.ast.CompilationUnit; +import com.github.javaparser.ast.Node; +import JavaExtractor.Common.CommandLineValues; +import JavaExtractor.Common.Common; +import JavaExtractor.Common.MethodContent; +import JavaExtractor.FeaturesEntities.ProgramFeatures; +import JavaExtractor.FeaturesEntities.Property; +import JavaExtractor.Visitors.FunctionVisitor; + +@SuppressWarnings("StringEquality") +public class FeatureExtractor { + private CommandLineValues m_CommandLineValues; + private static Set s_ParentTypeToAddChildId = Stream + .of("AssignExpr", "ArrayAccessExpr", "FieldAccessExpr", "MethodCallExpr") + .collect(Collectors.toCollection(HashSet::new)); + + final static String lparen = "("; + final static String rparen = ")"; + final static String upSymbol = "^"; + final static String downSymbol = "_"; + + public FeatureExtractor(CommandLineValues commandLineValues) { + this.m_CommandLineValues = commandLineValues; + } + + public ArrayList extractFeatures(String code) throws ParseException, IOException { + CompilationUnit compilationUnit = parseFileWithRetries(code); + FunctionVisitor functionVisitor = new FunctionVisitor(); + + functionVisitor.visit(compilationUnit, null); + + ArrayList methods = functionVisitor.getMethodContents(); + ArrayList programs = generatePathFeatures(methods); + + return programs; + } + + private CompilationUnit parseFileWithRetries(String code) throws IOException { + final String classPrefix = "public class Test {"; + final String classSuffix = "}"; + final String methodPrefix = "SomeUnknownReturnType f() {"; + final String methodSuffix = "return noSuchReturnValue; }"; + + String originalContent = code; + String content = originalContent; + CompilationUnit parsed = null; + try { + parsed = JavaParser.parse(content); + } catch (ParseProblemException e1) { + // Wrap with a class and method + try { + content = classPrefix + methodPrefix + originalContent + methodSuffix + classSuffix; + parsed = JavaParser.parse(content); + } catch (ParseProblemException e2) { + // Wrap with a class only + content = classPrefix + originalContent + classSuffix; + parsed = JavaParser.parse(content); + } + } + + return parsed; + } + + public ArrayList generatePathFeatures(ArrayList methods) { + ArrayList methodsFeatures = new ArrayList<>(); + for (MethodContent content : methods) { + if (content.getLength() < m_CommandLineValues.MinCodeLength + || content.getLength() > m_CommandLineValues.MaxCodeLength) + continue; + ProgramFeatures singleMethodFeatures = generatePathFeaturesForFunction(content); + if (!singleMethodFeatures.isEmpty()) { + methodsFeatures.add(singleMethodFeatures); + } + } + return methodsFeatures; + } + + private ProgramFeatures generatePathFeaturesForFunction(MethodContent methodContent) { + ArrayList functionLeaves = methodContent.getLeaves(); + ProgramFeatures programFeatures = new ProgramFeatures(methodContent.getName()); + + for (int i = 0; i < functionLeaves.size(); i++) { + for (int j = i + 1; j < functionLeaves.size(); j++) { + String separator = Common.EmptyString; + + String path = generatePath(functionLeaves.get(i), functionLeaves.get(j), separator); + if (path != Common.EmptyString) { + Property source = functionLeaves.get(i).getUserData(Common.PropertyKey); + Property target = functionLeaves.get(j).getUserData(Common.PropertyKey); + programFeatures.addFeature(source, path, target); + } + } + } + return programFeatures; + } + + private static ArrayList getTreeStack(Node node) { + ArrayList upStack = new ArrayList<>(); + Node current = node; + while (current != null) { + upStack.add(current); + current = current.getParentNode(); + } + return upStack; + } + + private String generatePath(Node source, Node target, String separator) { + String down = downSymbol; + String up = upSymbol; + String startSymbol = lparen; + String endSymbol = rparen; + + StringJoiner stringBuilder = new StringJoiner(separator); + ArrayList sourceStack = getTreeStack(source); + ArrayList targetStack = getTreeStack(target); + + int commonPrefix = 0; + int currentSourceAncestorIndex = sourceStack.size() - 1; + int currentTargetAncestorIndex = targetStack.size() - 1; + while (currentSourceAncestorIndex >= 0 && currentTargetAncestorIndex >= 0 + && sourceStack.get(currentSourceAncestorIndex) == targetStack.get(currentTargetAncestorIndex)) { + commonPrefix++; + currentSourceAncestorIndex--; + currentTargetAncestorIndex--; + } + + int pathLength = sourceStack.size() + targetStack.size() - 2 * commonPrefix; + if (pathLength > m_CommandLineValues.MaxPathLength) { + return Common.EmptyString; + } + + if (currentSourceAncestorIndex >= 0 && currentTargetAncestorIndex >= 0) { + int pathWidth = targetStack.get(currentTargetAncestorIndex).getUserData(Common.ChildId) + - sourceStack.get(currentSourceAncestorIndex).getUserData(Common.ChildId); + if (pathWidth > m_CommandLineValues.MaxPathWidth) { + return Common.EmptyString; + } + } + + for (int i = 0; i < sourceStack.size() - commonPrefix; i++) { + Node currentNode = sourceStack.get(i); + String childId = Common.EmptyString; + String parentRawType = currentNode.getParentNode().getUserData(Common.PropertyKey).getRawType(); + if (i == 0 || s_ParentTypeToAddChildId.contains(parentRawType)) { + childId = saturateChildId(currentNode.getUserData(Common.ChildId)) + .toString(); + } + stringBuilder.add(String.format("%s%s%s%s%s", startSymbol, + currentNode.getUserData(Common.PropertyKey).getType(), childId, endSymbol, up)); + } + + Node commonNode = sourceStack.get(sourceStack.size() - commonPrefix); + String commonNodeChildId = Common.EmptyString; + Property parentNodeProperty = commonNode.getParentNode().getUserData(Common.PropertyKey); + String commonNodeParentRawType = Common.EmptyString; + if (parentNodeProperty != null) { + commonNodeParentRawType = parentNodeProperty.getRawType(); + } + if (s_ParentTypeToAddChildId.contains(commonNodeParentRawType)) { + commonNodeChildId = saturateChildId(commonNode.getUserData(Common.ChildId)) + .toString(); + } + stringBuilder.add(String.format("%s%s%s%s", startSymbol, + commonNode.getUserData(Common.PropertyKey).getType(), commonNodeChildId, endSymbol)); + + for (int i = targetStack.size() - commonPrefix - 1; i >= 0; i--) { + Node currentNode = targetStack.get(i); + String childId = Common.EmptyString; + if (i == 0 || s_ParentTypeToAddChildId.contains(currentNode.getUserData(Common.PropertyKey).getRawType())) { + childId = saturateChildId(currentNode.getUserData(Common.ChildId)) + .toString(); + } + stringBuilder.add(String.format("%s%s%s%s%s", down, startSymbol, + currentNode.getUserData(Common.PropertyKey).getType(), childId, endSymbol)); + } + + return stringBuilder.toString(); + } + + private Integer saturateChildId(int childId) { + return Math.min(childId, m_CommandLineValues.MaxChildId); + } +} diff --git a/JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramFeatures.java b/pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramFeatures.java old mode 100755 new mode 100644 similarity index 95% rename from JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramFeatures.java rename to pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramFeatures.java index 755c2ce..92c708f --- a/JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramFeatures.java +++ b/pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramFeatures.java @@ -1,49 +1,49 @@ -package JavaExtractor.FeaturesEntities; - -import java.util.ArrayList; -import java.util.stream.Collectors; - -import com.fasterxml.jackson.annotation.JsonIgnore; - -public class ProgramFeatures { - private String name; - - private ArrayList features = new ArrayList<>(); - - public ProgramFeatures(String name) { - this.name = name; - } - - @SuppressWarnings("StringBufferReplaceableByString") - @Override - public String toString() { - StringBuilder stringBuilder = new StringBuilder(); - stringBuilder.append(name).append(" "); - stringBuilder.append(features.stream().map(ProgramRelation::toString).collect(Collectors.joining(" "))); - - return stringBuilder.toString(); - } - - public void addFeature(Property source, String path, Property target) { - ProgramRelation newRelation = new ProgramRelation(source, target, path); - features.add(newRelation); - } - - @JsonIgnore - public boolean isEmpty() { - return features.isEmpty(); - } - - public void deleteAllPaths() { - features.clear(); - } - - public String getName() { - return name; - } - - public ArrayList getFeatures() { - return features; - } - -} +package JavaExtractor.FeaturesEntities; + +import java.util.ArrayList; +import java.util.stream.Collectors; + +import com.fasterxml.jackson.annotation.JsonIgnore; + +public class ProgramFeatures { + private String name; + + private ArrayList features = new ArrayList<>(); + + public ProgramFeatures(String name) { + this.name = name; + } + + @SuppressWarnings("StringBufferReplaceableByString") + @Override + public String toString() { + StringBuilder stringBuilder = new StringBuilder(); + stringBuilder.append(name).append(" "); + stringBuilder.append(features.stream().map(ProgramRelation::toString).collect(Collectors.joining(" "))); + + return stringBuilder.toString(); + } + + public void addFeature(Property source, String path, Property target) { + ProgramRelation newRelation = new ProgramRelation(source, target, path); + features.add(newRelation); + } + + @JsonIgnore + public boolean isEmpty() { + return features.isEmpty(); + } + + public void deleteAllPaths() { + features.clear(); + } + + public String getName() { + return name; + } + + public ArrayList getFeatures() { + return features; + } + +} diff --git a/JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramNode.java b/pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramNode.java old mode 100755 new mode 100644 similarity index 95% rename from JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramNode.java rename to pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramNode.java index 998bd90..7cf13c0 --- a/JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramNode.java +++ b/pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramNode.java @@ -1,21 +1,21 @@ -package JavaExtractor.FeaturesEntities; - -import java.io.UnsupportedEncodingException; -import java.net.URLEncoder; -import JavaExtractor.Common.Common; - -public class ProgramNode { - public int Id; - public String Type; - public String Name; - public boolean IsMethodDeclarationName; - - public ProgramNode(String name) { - Name = name; - try { - Name = URLEncoder.encode(name, Common.UTF8); - } catch (UnsupportedEncodingException e) { - e.printStackTrace(); - } - } -} +package JavaExtractor.FeaturesEntities; + +import java.io.UnsupportedEncodingException; +import java.net.URLEncoder; +import JavaExtractor.Common.Common; + +public class ProgramNode { + public int Id; + public String Type; + public String Name; + public boolean IsMethodDeclarationName; + + public ProgramNode(String name) { + Name = name; + try { + Name = URLEncoder.encode(name, Common.UTF8); + } catch (UnsupportedEncodingException e) { + e.printStackTrace(); + } + } +} diff --git a/JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramRelation.java b/pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramRelation.java old mode 100755 new mode 100644 similarity index 95% rename from JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramRelation.java rename to pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramRelation.java index b168d62..47d543f --- a/JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramRelation.java +++ b/pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramRelation.java @@ -1,56 +1,56 @@ -package JavaExtractor.FeaturesEntities; - -import java.util.ArrayList; -import java.util.function.Function; - -import com.fasterxml.jackson.annotation.JsonIgnore; -import com.fasterxml.jackson.annotation.JsonIgnoreProperties; -import com.fasterxml.jackson.annotation.JsonPropertyDescription; - -public class ProgramRelation { - private Property m_Source; - private Property m_Target; - private String m_HashedPath; - private String m_Path; - @SuppressWarnings("FieldCanBeLocal") - @JsonPropertyDescription - private ArrayList result; - public static Function s_Hasher = (s) -> Integer.toString(s.hashCode()); - - public ProgramRelation(Property sourceName, Property targetName, String path) { - m_Source = sourceName; - m_Target = targetName; - m_Path = path; - m_HashedPath = s_Hasher.apply(path); - } - - public static void setNoHash() { - s_Hasher = (s) -> s; - } - - public String toString() { - return String.format("%s,%s,%s", m_Source.getName(), m_HashedPath, - m_Target.getName()); - } - - @JsonIgnore - public String getPath() { - return m_Path; - } - - @JsonIgnore - public Property getSource() { - return m_Source; - } - - @JsonIgnoreProperties - public Property getTarget() { - return m_Target; - } - - @JsonIgnore - public String getHashedPath() { - return m_HashedPath; - - } -} +package JavaExtractor.FeaturesEntities; + +import java.util.ArrayList; +import java.util.function.Function; + +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import com.fasterxml.jackson.annotation.JsonPropertyDescription; + +public class ProgramRelation { + private Property m_Source; + private Property m_Target; + private String m_HashedPath; + private String m_Path; + @SuppressWarnings("FieldCanBeLocal") + @JsonPropertyDescription + private ArrayList result; + public static Function s_Hasher = (s) -> Integer.toString(s.hashCode()); + + public ProgramRelation(Property sourceName, Property targetName, String path) { + m_Source = sourceName; + m_Target = targetName; + m_Path = path; + m_HashedPath = s_Hasher.apply(path); + } + + public static void setNoHash() { + s_Hasher = (s) -> s; + } + + public String toString() { + return String.format("%s,%s,%s", m_Source.getName(), m_HashedPath, + m_Target.getName()); + } + + @JsonIgnore + public String getPath() { + return m_Path; + } + + @JsonIgnore + public Property getSource() { + return m_Source; + } + + @JsonIgnoreProperties + public Property getTarget() { + return m_Target; + } + + @JsonIgnore + public String getHashedPath() { + return m_HashedPath; + + } +} diff --git a/JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/Property.java b/pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/Property.java old mode 100755 new mode 100644 similarity index 96% rename from JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/Property.java rename to pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/Property.java index 697acc7..3718341 --- a/JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/Property.java +++ b/pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/Property.java @@ -1,90 +1,90 @@ -package JavaExtractor.FeaturesEntities; - -import java.util.ArrayList; -import java.util.HashSet; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -import com.github.javaparser.ast.Node; -import com.github.javaparser.ast.expr.AssignExpr; -import com.github.javaparser.ast.expr.BinaryExpr; -import com.github.javaparser.ast.expr.IntegerLiteralExpr; -import com.github.javaparser.ast.expr.UnaryExpr; -import com.github.javaparser.ast.type.ClassOrInterfaceType; - -import JavaExtractor.Common.Common; - -public class Property { - private String RawType; - private String Type; - private String Name; - private String SplitName; - private String Operator; - public static final HashSet NumericalKeepValues = Stream.of("0", "1", "32", "64") - .collect(Collectors.toCollection(HashSet::new)); - - public Property(Node node, boolean isLeaf, boolean isGenericParent, int id) { - Class nodeClass = node.getClass(); - RawType = Type = nodeClass.getSimpleName(); - if (node instanceof ClassOrInterfaceType && ((ClassOrInterfaceType) node).isBoxedType()) { - Type = "PrimitiveType"; - } - Operator = ""; - if (node instanceof BinaryExpr) { - Operator = ((BinaryExpr) node).getOperator().toString(); - } else if (node instanceof UnaryExpr) { - Operator = ((UnaryExpr) node).getOperator().toString(); - } else if (node instanceof AssignExpr) { - Operator = ((AssignExpr) node).getOperator().toString(); - } - if (Operator.length() > 0) { - Type += ":" + Operator; - } - - String nameToSplit = node.toString(); - if (isGenericParent) { - nameToSplit = ((ClassOrInterfaceType) node).getName(); - if (isLeaf) { - // if it is a generic parent which counts as a leaf, then when - // it is participating in a path - // as a parent, it should be GenericClass and not a simple - // ClassOrInterfaceType. - Type = "GenericClass"; - } - } - ArrayList splitNameParts = Common.splitToSubtokens(nameToSplit); - SplitName = splitNameParts.stream().collect(Collectors.joining(Common.internalSeparator)); - - node.toString(); - Name = Common.normalizeName(node.toString(), Common.BlankWord); - if (Name.length() > Common.c_MaxLabelLength) { - Name = Name.substring(0, Common.c_MaxLabelLength); - } else if (node instanceof ClassOrInterfaceType && ((ClassOrInterfaceType) node).isBoxedType()) { - Name = ((ClassOrInterfaceType) node).toUnboxedType().toString(); - } - - if (Common.isMethod(node, Type)) { - Name = SplitName = Common.methodName; - } - - if (SplitName.length() == 0) { - SplitName = Name; - if (node instanceof IntegerLiteralExpr && !NumericalKeepValues.contains(SplitName)) { - // This is a numeric literal, but not in our white list - SplitName = ""; - } - } - } - - public String getRawType() { - return RawType; - } - - public String getType() { - return Type; - } - - public String getName() { - return Name; - } -} +package JavaExtractor.FeaturesEntities; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import com.github.javaparser.ast.Node; +import com.github.javaparser.ast.expr.AssignExpr; +import com.github.javaparser.ast.expr.BinaryExpr; +import com.github.javaparser.ast.expr.IntegerLiteralExpr; +import com.github.javaparser.ast.expr.UnaryExpr; +import com.github.javaparser.ast.type.ClassOrInterfaceType; + +import JavaExtractor.Common.Common; + +public class Property { + private String RawType; + private String Type; + private String Name; + private String SplitName; + private String Operator; + public static final HashSet NumericalKeepValues = Stream.of("0", "1", "32", "64") + .collect(Collectors.toCollection(HashSet::new)); + + public Property(Node node, boolean isLeaf, boolean isGenericParent, int id) { + Class nodeClass = node.getClass(); + RawType = Type = nodeClass.getSimpleName(); + if (node instanceof ClassOrInterfaceType && ((ClassOrInterfaceType) node).isBoxedType()) { + Type = "PrimitiveType"; + } + Operator = ""; + if (node instanceof BinaryExpr) { + Operator = ((BinaryExpr) node).getOperator().toString(); + } else if (node instanceof UnaryExpr) { + Operator = ((UnaryExpr) node).getOperator().toString(); + } else if (node instanceof AssignExpr) { + Operator = ((AssignExpr) node).getOperator().toString(); + } + if (Operator.length() > 0) { + Type += ":" + Operator; + } + + String nameToSplit = node.toString(); + if (isGenericParent) { + nameToSplit = ((ClassOrInterfaceType) node).getName(); + if (isLeaf) { + // if it is a generic parent which counts as a leaf, then when + // it is participating in a path + // as a parent, it should be GenericClass and not a simple + // ClassOrInterfaceType. + Type = "GenericClass"; + } + } + ArrayList splitNameParts = Common.splitToSubtokens(nameToSplit); + SplitName = splitNameParts.stream().collect(Collectors.joining(Common.internalSeparator)); + + node.toString(); + Name = Common.normalizeName(node.toString(), Common.BlankWord); + if (Name.length() > Common.c_MaxLabelLength) { + Name = Name.substring(0, Common.c_MaxLabelLength); + } else if (node instanceof ClassOrInterfaceType && ((ClassOrInterfaceType) node).isBoxedType()) { + Name = ((ClassOrInterfaceType) node).toUnboxedType().toString(); + } + + if (Common.isMethod(node, Type)) { + Name = SplitName = Common.methodName; + } + + if (SplitName.length() == 0) { + SplitName = Name; + if (node instanceof IntegerLiteralExpr && !NumericalKeepValues.contains(SplitName)) { + // This is a numeric literal, but not in our white list + SplitName = ""; + } + } + } + + public String getRawType() { + return RawType; + } + + public String getType() { + return Type; + } + + public String getName() { + return Name; + } +} diff --git a/JavaExtractor/JPredict/src/main/java/JavaExtractor/Visitors/FunctionVisitor.java b/pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/Visitors/FunctionVisitor.java old mode 100755 new mode 100644 similarity index 97% rename from JavaExtractor/JPredict/src/main/java/JavaExtractor/Visitors/FunctionVisitor.java rename to pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/Visitors/FunctionVisitor.java index e706e25..5703e0e --- a/JavaExtractor/JPredict/src/main/java/JavaExtractor/Visitors/FunctionVisitor.java +++ b/pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/Visitors/FunctionVisitor.java @@ -1,60 +1,60 @@ -package JavaExtractor.Visitors; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.stream.Collectors; - -import com.github.javaparser.ast.Node; -import com.github.javaparser.ast.body.MethodDeclaration; -import com.github.javaparser.ast.visitor.VoidVisitorAdapter; - -import JavaExtractor.Common.Common; -import JavaExtractor.Common.MethodContent; - -@SuppressWarnings("StringEquality") -public class FunctionVisitor extends VoidVisitorAdapter { - private ArrayList m_Methods = new ArrayList<>(); - - @Override - public void visit(MethodDeclaration node, Object arg) { - visitMethod(node, arg); - - super.visit(node, arg); - } - - private void visitMethod(MethodDeclaration node, Object obj) { - LeavesCollectorVisitor leavesCollectorVisitor = new LeavesCollectorVisitor(); - leavesCollectorVisitor.visitDepthFirst(node); - ArrayList leaves = leavesCollectorVisitor.getLeaves(); - - String normalizedMethodName = Common.normalizeName(node.getName(), Common.BlankWord); - ArrayList splitNameParts = Common.splitToSubtokens(node.getName()); - String splitName = normalizedMethodName; - if (splitNameParts.size() > 0) { - splitName = splitNameParts.stream().collect(Collectors.joining(Common.internalSeparator)); - } - - if (node.getBody() != null) { - m_Methods.add(new MethodContent(leaves, splitName, getMethodLength(node.getBody().toString()))); - } - } - - private long getMethodLength(String code) { - String cleanCode = code.replaceAll("\r\n", "\n").replaceAll("\t", " "); - if (cleanCode.startsWith("{\n")) - cleanCode = cleanCode.substring(3).trim(); - if (cleanCode.endsWith("\n}")) - cleanCode = cleanCode.substring(0, cleanCode.length() - 2).trim(); - if (cleanCode.length() == 0) { - return 0; - } - long codeLength = Arrays.asList(cleanCode.split("\n")).stream() - .filter(line -> (line.trim() != "{" && line.trim() != "}" && line.trim() != "")) - .filter(line -> !line.trim().startsWith("/") && !line.trim().startsWith("*")).count(); - return codeLength; - } - - public ArrayList getMethodContents() { - return m_Methods; - } -} +package JavaExtractor.Visitors; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.stream.Collectors; + +import com.github.javaparser.ast.Node; +import com.github.javaparser.ast.body.MethodDeclaration; +import com.github.javaparser.ast.visitor.VoidVisitorAdapter; + +import JavaExtractor.Common.Common; +import JavaExtractor.Common.MethodContent; + +@SuppressWarnings("StringEquality") +public class FunctionVisitor extends VoidVisitorAdapter { + private ArrayList m_Methods = new ArrayList<>(); + + @Override + public void visit(MethodDeclaration node, Object arg) { + visitMethod(node, arg); + + super.visit(node, arg); + } + + private void visitMethod(MethodDeclaration node, Object obj) { + LeavesCollectorVisitor leavesCollectorVisitor = new LeavesCollectorVisitor(); + leavesCollectorVisitor.visitDepthFirst(node); + ArrayList leaves = leavesCollectorVisitor.getLeaves(); + + String normalizedMethodName = Common.normalizeName(node.getName(), Common.BlankWord); + ArrayList splitNameParts = Common.splitToSubtokens(node.getName()); + String splitName = normalizedMethodName; + if (splitNameParts.size() > 0) { + splitName = splitNameParts.stream().collect(Collectors.joining(Common.internalSeparator)); + } + + if (node.getBody() != null) { + m_Methods.add(new MethodContent(leaves, splitName, getMethodLength(node.getBody().toString()))); + } + } + + private long getMethodLength(String code) { + String cleanCode = code.replaceAll("\r\n", "\n").replaceAll("\t", " "); + if (cleanCode.startsWith("{\n")) + cleanCode = cleanCode.substring(3).trim(); + if (cleanCode.endsWith("\n}")) + cleanCode = cleanCode.substring(0, cleanCode.length() - 2).trim(); + if (cleanCode.length() == 0) { + return 0; + } + long codeLength = Arrays.asList(cleanCode.split("\n")).stream() + .filter(line -> (line.trim() != "{" && line.trim() != "}" && line.trim() != "")) + .filter(line -> !line.trim().startsWith("/") && !line.trim().startsWith("*")).count(); + return codeLength; + } + + public ArrayList getMethodContents() { + return m_Methods; + } +} diff --git a/JavaExtractor/JPredict/src/main/java/JavaExtractor/Visitors/LeavesCollectorVisitor.java b/pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/Visitors/LeavesCollectorVisitor.java old mode 100755 new mode 100644 similarity index 96% rename from JavaExtractor/JPredict/src/main/java/JavaExtractor/Visitors/LeavesCollectorVisitor.java rename to pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/Visitors/LeavesCollectorVisitor.java index 3430b89..a101d0d --- a/JavaExtractor/JPredict/src/main/java/JavaExtractor/Visitors/LeavesCollectorVisitor.java +++ b/pipeline/JavaExtractor/JPredict/src/main/java/JavaExtractor/Visitors/LeavesCollectorVisitor.java @@ -1,69 +1,69 @@ -package JavaExtractor.Visitors; - -import java.util.ArrayList; -import java.util.List; -import com.github.javaparser.ast.Node; -import com.github.javaparser.ast.comments.Comment; -import com.github.javaparser.ast.expr.NullLiteralExpr; -import com.github.javaparser.ast.stmt.Statement; -import com.github.javaparser.ast.type.ClassOrInterfaceType; -import com.github.javaparser.ast.visitor.TreeVisitor; - -import JavaExtractor.Common.Common; -import JavaExtractor.FeaturesEntities.Property; - -public class LeavesCollectorVisitor extends TreeVisitor { - ArrayList m_Leaves = new ArrayList<>(); - private int currentId = 1; - - @Override - public void process(Node node) { - if (node instanceof Comment) { - return; - } - boolean isLeaf = false; - boolean isGenericParent = isGenericParent(node); - if (hasNoChildren(node) && isNotComment(node)) { - if (!node.toString().isEmpty() && (!"null".equals(node.toString()) || (node instanceof NullLiteralExpr))) { - m_Leaves.add(node); - isLeaf = true; - } - } - - int childId = getChildId(node); - node.setUserData(Common.ChildId, childId); - Property property = new Property(node, isLeaf, isGenericParent, currentId++); - node.setUserData(Common.PropertyKey, property); - } - - private boolean isGenericParent(Node node) { - return (node instanceof ClassOrInterfaceType) - && ((ClassOrInterfaceType)node).getTypeArguments() != null - && ((ClassOrInterfaceType)node).getTypeArguments().size() > 0; - } - - private boolean hasNoChildren(Node node) { - return node.getChildrenNodes().size() == 0; - } - - private boolean isNotComment(Node node) { - return !(node instanceof Comment) && !(node instanceof Statement); - } - - public ArrayList getLeaves() { - return m_Leaves; - } - - private int getChildId(Node node) { - Node parent = node.getParentNode(); - List parentsChildren = parent.getChildrenNodes(); - int childId = 0; - for (Node child: parentsChildren) { - if (child.getRange().equals(node.getRange())) { - return childId; - } - childId++; - } - return childId; - } -} +package JavaExtractor.Visitors; + +import java.util.ArrayList; +import java.util.List; +import com.github.javaparser.ast.Node; +import com.github.javaparser.ast.comments.Comment; +import com.github.javaparser.ast.expr.NullLiteralExpr; +import com.github.javaparser.ast.stmt.Statement; +import com.github.javaparser.ast.type.ClassOrInterfaceType; +import com.github.javaparser.ast.visitor.TreeVisitor; + +import JavaExtractor.Common.Common; +import JavaExtractor.FeaturesEntities.Property; + +public class LeavesCollectorVisitor extends TreeVisitor { + ArrayList m_Leaves = new ArrayList<>(); + private int currentId = 1; + + @Override + public void process(Node node) { + if (node instanceof Comment) { + return; + } + boolean isLeaf = false; + boolean isGenericParent = isGenericParent(node); + if (hasNoChildren(node) && isNotComment(node)) { + if (!node.toString().isEmpty() && (!"null".equals(node.toString()) || (node instanceof NullLiteralExpr))) { + m_Leaves.add(node); + isLeaf = true; + } + } + + int childId = getChildId(node); + node.setUserData(Common.ChildId, childId); + Property property = new Property(node, isLeaf, isGenericParent, currentId++); + node.setUserData(Common.PropertyKey, property); + } + + private boolean isGenericParent(Node node) { + return (node instanceof ClassOrInterfaceType) + && ((ClassOrInterfaceType)node).getTypeArguments() != null + && ((ClassOrInterfaceType)node).getTypeArguments().size() > 0; + } + + private boolean hasNoChildren(Node node) { + return node.getChildrenNodes().size() == 0; + } + + private boolean isNotComment(Node node) { + return !(node instanceof Comment) && !(node instanceof Statement); + } + + public ArrayList getLeaves() { + return m_Leaves; + } + + private int getChildId(Node node) { + Node parent = node.getParentNode(); + List parentsChildren = parent.getChildrenNodes(); + int childId = 0; + for (Node child: parentsChildren) { + if (child.getRange().equals(node.getRange())) { + return childId; + } + childId++; + } + return childId; + } +} diff --git a/JavaExtractor/JPredict/src/main/java/Test.java b/pipeline/JavaExtractor/JPredict/src/main/java/Test.java old mode 100755 new mode 100644 similarity index 96% rename from JavaExtractor/JPredict/src/main/java/Test.java rename to pipeline/JavaExtractor/JPredict/src/main/java/Test.java index d8aad73..59c13f4 --- a/JavaExtractor/JPredict/src/main/java/Test.java +++ b/pipeline/JavaExtractor/JPredict/src/main/java/Test.java @@ -1,3 +1,3 @@ -void fooBar() { - System.out.println("Hello World"); +void fooBar() { + System.out.println("Hello World"); } \ No newline at end of file diff --git a/JavaExtractor/extract.py b/pipeline/JavaExtractor/extract.py old mode 100755 new mode 100644 similarity index 100% rename from JavaExtractor/extract.py rename to pipeline/JavaExtractor/extract.py diff --git a/PathContextReader.py b/pipeline/PathContextReader.py old mode 100755 new mode 100644 similarity index 100% rename from PathContextReader.py rename to pipeline/PathContextReader.py diff --git a/agg_functions.py b/pipeline/agg_functions.py similarity index 100% rename from agg_functions.py rename to pipeline/agg_functions.py diff --git a/aggregation_pipeline.py b/pipeline/aggregation_pipeline.py similarity index 96% rename from aggregation_pipeline.py rename to pipeline/aggregation_pipeline.py index 8e20a99..2012fe6 100644 --- a/aggregation_pipeline.py +++ b/pipeline/aggregation_pipeline.py @@ -1,47 +1,47 @@ -from selection_methods import SelectAll -from agg_functions import VectorMean -from output_formats import ARFFFile -from reduction_methods import NoReduction -import numpy as np - -class AggregationPipeline: - - def __init__(self, dataset_name, model_name, selection_method = SelectAll, agg_function = VectorMean, reduction_method = NoReduction, output_format = ARFFFile): - self.dataset_name = dataset_name - self.model_name = model_name - self.selection_method = selection_method - self.agg_function = agg_function - self.reduction_method = reduction_method - self.output_format = output_format - - def aggregate_vectors(self, vectors): - if len(vectors) > 0: - # Select only the vectors we want to select - selector = self.selection_method(vectors) - selected_vectors = selector.select() - - # Aggregate them into a single vector - aggregator = self.agg_function(selected_vectors) - single_vector = aggregator.aggregate() - - return single_vector - else: - return [] - - def process_dataset(self, df): - # Reduce the dimensionality of the dataset - reducer = self.reduction_method(df) - reduced_dim_df = reducer.reduce() - - # Output it in the desired format - wf = ARFFFile( - self.dataset_name, - self.model_name, - self.selection_method.name(), - self.agg_function.name(), - self.reduction_method.name(), - reduced_dim_df) - wf.write_to_file() - - - +from selection_methods import SelectAll +from agg_functions import VectorMean +from output_formats import ARFFFile +from reduction_methods import NoReduction +import numpy as np + +class AggregationPipeline: + + def __init__(self, dataset_name, model_name, selection_method = SelectAll, agg_function = VectorMean, reduction_method = NoReduction, output_format = ARFFFile): + self.dataset_name = dataset_name + self.model_name = model_name + self.selection_method = selection_method + self.agg_function = agg_function + self.reduction_method = reduction_method + self.output_format = output_format + + def aggregate_vectors(self, vectors): + if len(vectors) > 0: + # Select only the vectors we want to select + selector = self.selection_method(vectors) + selected_vectors = selector.select() + + # Aggregate them into a single vector + aggregator = self.agg_function(selected_vectors) + single_vector = aggregator.aggregate() + + return single_vector + else: + return [] + + def process_dataset(self, df): + # Reduce the dimensionality of the dataset + reducer = self.reduction_method(df) + reduced_dim_df = reducer.reduce() + + # Output it in the desired format + wf = ARFFFile( + self.dataset_name, + self.model_name, + self.selection_method.name(), + self.agg_function.name(), + self.reduction_method.name(), + reduced_dim_df) + wf.write_to_file() + + + diff --git a/common.py b/pipeline/common.py old mode 100755 new mode 100644 similarity index 100% rename from common.py rename to pipeline/common.py diff --git a/extractor.py b/pipeline/extractor.py similarity index 100% rename from extractor.py rename to pipeline/extractor.py diff --git a/file2vec.py b/pipeline/file2vec.py similarity index 99% rename from file2vec.py rename to pipeline/file2vec.py index 088813a..540dfd0 100644 --- a/file2vec.py +++ b/pipeline/file2vec.py @@ -5,6 +5,7 @@ import time import random import re +import json from selection_methods import selection_methods from agg_functions import all_func @@ -25,7 +26,7 @@ CLASS_PREPROCESS_JAR_PATH = 'java-tool.jar' tmp_file_name = "tmpsnippet.java" -debug=False +debug=True class File2Vec: @@ -100,6 +101,7 @@ def create_file_vectors(self): try: predict_lines, hash_to_string_dict = self.path_extractor.extract_paths(tmp_file_name) except ValueError as e: + print(e) if debug: print("Error for method {} in file {}".format(method, file)) continue @@ -117,7 +119,7 @@ def create_file_vectors(self): print(fileNum, file, "Time:", time.time() - time0) fileNum += 1 - #os.remove(tmp_file_name) + os.remove(tmp_file_name) return file_vectors @@ -199,7 +201,6 @@ def run_pipeline(self, file_vectors): else: print(file_) - # Set up the dataframe values to hold the resulting dataset columns = self.get_num_columns(file_vectors) diff --git a/java_files/README.md b/pipeline/java_files/README.md similarity index 100% rename from java_files/README.md rename to pipeline/java_files/README.md diff --git a/main.py b/pipeline/main.py similarity index 100% rename from main.py rename to pipeline/main.py diff --git a/model.py b/pipeline/model.py old mode 100755 new mode 100644 similarity index 100% rename from model.py rename to pipeline/model.py diff --git a/model_defs.py b/pipeline/model_defs.py similarity index 84% rename from model_defs.py rename to pipeline/model_defs.py index 2448c16..9e541be 100644 --- a/model_defs.py +++ b/pipeline/model_defs.py @@ -5,7 +5,7 @@ 'args': "" }, { - 'location': 'models/random/saved_model_iter2', + 'location': 'models/standard/saved_model_iter2', 'name': 'random', 'args': "-r" }, diff --git a/models/README.md b/pipeline/models/README.md similarity index 100% rename from models/README.md rename to pipeline/models/README.md diff --git a/output_formats.py b/pipeline/output_formats.py similarity index 100% rename from output_formats.py rename to pipeline/output_formats.py diff --git a/reduction_methods.py b/pipeline/reduction_methods.py similarity index 95% rename from reduction_methods.py rename to pipeline/reduction_methods.py index 2b68ed0..e126c0b 100644 --- a/reduction_methods.py +++ b/pipeline/reduction_methods.py @@ -1,87 +1,87 @@ -import time -import numpy as np -import pandas as pd -import umap - - -class AbstractReductionMethod: - - def __init__(self, df): - self.df = df - - def reduce(): - raise NotImplementedError - - -class NoReduction(AbstractReductionMethod): - def __init__(self, df): - return super().__init__(df) - - def reduce(self): - return self.df - - @staticmethod - def name(): - return "none" - -class AbstractUMap(AbstractReductionMethod): - def __init__(self, df, k): - # We can't have K be more than the # of instances - if k > df.shape[0] - 2: - k = df.shape[0] - 2 - - self.k = k - return super().__init__(df) - - def reduce(self): - reducer = umap.UMAP(n_components=self.k) - - embedding = reducer.fit_transform(self.df.iloc[:,:-2]) - - # Convert the embedding (numpy array) back into a dataframe - feat_cols = ["x{}".format(i) for i in range(embedding.shape[1])] - - return_df = pd.DataFrame(data=embedding, columns = feat_cols) - return_df['filename'] = self.df['filename'] - return_df['class_val'] = self.df['class_val'] - return return_df - -class UMapReduction25(AbstractUMap): - def __init__(self, df): - return super().__init__(df, 25) - - @staticmethod - def name(): - return "UMap25" - -class UMapReduction50(AbstractUMap): - def __init__(self, df): - return super().__init__(df, 50) - - @staticmethod - def name(): - return "UMap50" - -class UMapReduction100(AbstractUMap): - def __init__(self, df): - return super().__init__(df, 100) - - @staticmethod - def name(): - return "UMap100" - -class UMapReduction250(AbstractUMap): - def __init__(self, df): - return super().__init__(df, 250) - - @staticmethod - def name(): - return "UMap250" - -reduction_methods = [ - NoReduction, - UMapReduction25, - UMapReduction50, - UMapReduction100, - UMapReduction250 -] +import time +import numpy as np +import pandas as pd +import umap + + +class AbstractReductionMethod: + + def __init__(self, df): + self.df = df + + def reduce(): + raise NotImplementedError + + +class NoReduction(AbstractReductionMethod): + def __init__(self, df): + return super().__init__(df) + + def reduce(self): + return self.df + + @staticmethod + def name(): + return "none" + +class AbstractUMap(AbstractReductionMethod): + def __init__(self, df, k): + # We can't have K be more than the # of instances + if k > df.shape[0] - 2: + k = df.shape[0] - 2 + + self.k = k + return super().__init__(df) + + def reduce(self): + reducer = umap.UMAP(n_components=self.k) + + embedding = reducer.fit_transform(self.df.iloc[:,:-2]) + + # Convert the embedding (numpy array) back into a dataframe + feat_cols = ["x{}".format(i) for i in range(embedding.shape[1])] + + return_df = pd.DataFrame(data=embedding, columns = feat_cols) + return_df['filename'] = self.df['filename'] + return_df['class_val'] = self.df['class_val'] + return return_df + +class UMapReduction25(AbstractUMap): + def __init__(self, df): + return super().__init__(df, 25) + + @staticmethod + def name(): + return "UMap25" + +class UMapReduction50(AbstractUMap): + def __init__(self, df): + return super().__init__(df, 50) + + @staticmethod + def name(): + return "UMap50" + +class UMapReduction100(AbstractUMap): + def __init__(self, df): + return super().__init__(df, 100) + + @staticmethod + def name(): + return "UMap100" + +class UMapReduction250(AbstractUMap): + def __init__(self, df): + return super().__init__(df, 250) + + @staticmethod + def name(): + return "UMap250" + +reduction_methods = [ + NoReduction, + UMapReduction25, + UMapReduction50, + UMapReduction100, + UMapReduction250 +] diff --git a/pipeline/requirements.txt b/pipeline/requirements.txt new file mode 100644 index 0000000..45c635d --- /dev/null +++ b/pipeline/requirements.txt @@ -0,0 +1,4 @@ +numpy +pandas +tensorflow==1.14.0 +umap-learn \ No newline at end of file diff --git a/scripts/copy_to_new_folder.py b/pipeline/scripts/copy_to_new_folder.py similarity index 100% rename from scripts/copy_to_new_folder.py rename to pipeline/scripts/copy_to_new_folder.py diff --git a/scripts/create_datasets.sh b/pipeline/scripts/create_datasets.sh similarity index 90% rename from scripts/create_datasets.sh rename to pipeline/scripts/create_datasets.sh index d2b744f..889c053 100644 --- a/scripts/create_datasets.sh +++ b/pipeline/scripts/create_datasets.sh @@ -1,9 +1,7 @@ # Using the models specified in model_defs, run each one on the datasets in java_files # This creates the .arff files to be used in the weka experimenter in the weka_files folder -NUM_MODELS=6 - -conda activate tf_new +NUM_MODELS=2 counter=0 while [ $counter -lt $NUM_MODELS ] diff --git a/scripts/weka_exp_results_processor.py b/pipeline/scripts/weka_exp_results_processor.py similarity index 100% rename from scripts/weka_exp_results_processor.py rename to pipeline/scripts/weka_exp_results_processor.py diff --git a/selection_methods.py b/pipeline/selection_methods.py similarity index 94% rename from selection_methods.py rename to pipeline/selection_methods.py index 9767e00..89918bf 100644 --- a/selection_methods.py +++ b/pipeline/selection_methods.py @@ -1,135 +1,135 @@ -import random -import numpy as np - -class AbstractSelectionMethod: - - def __init__(self, vectors, max_num): - self.vectors = vectors - - if len(self.vectors) < max_num: - max_num = len(self.vectors) - self.max_num = max_num - - def select(self, vectors): - raise NotImplementedError - - -class SelectAll(AbstractSelectionMethod): - - def __init__(self, vectors): - super().__init__(vectors, 999) - - def select(self): - return [x['vector'] for x in self.vectors] - - @staticmethod - def name(): - return "all" - -class SelectRandomK(AbstractSelectionMethod): - def __init__(self, vectors, k): - super().__init__(vectors, k) - - def select(self): - return [x['vector'] for x in random.sample(self.vectors, self.max_num)] - - @staticmethod - def name(): - raise NotImplementedError - -class SelectRandom1(SelectRandomK): - - def __init__(self, vectors): - super().__init__(vectors, 1) - - @staticmethod - def name(): - return "rand1" - -class SelectRandom2(SelectRandomK): - - def __init__(self, vectors): - super().__init__(vectors, 2) - - @staticmethod - def name(): - return "rand2" - -class SelectRandom3(SelectRandomK): - - def __init__(self, vectors): - super().__init__(vectors, 3) - - @staticmethod - def name(): - return "rand3" - -class SelectRandom5(SelectRandomK): - - def __init__(self, vectors): - super().__init__(vectors, 5) - - @staticmethod - def name(): - return "rand5" - -class SelectTopK(AbstractSelectionMethod): - - def __init__(self, vectors, k): - super().__init__(vectors, k) - - def select(self): - self.vectors.sort(key=lambda x: x['length'], reverse=True) - return [x['vector'] for x in self.vectors[:self.max_num]] - - @staticmethod - def name(): - raise NotImplementedError - -class SelectTop1(SelectTopK): - - def __init__(self, vectors): - super().__init__(vectors, 1) - - @staticmethod - def name(): - return "top1" - -class SelectTop2(SelectTopK): - - def __init__(self, vectors): - super().__init__(vectors, 2) - - @staticmethod - def name(): - return "top2" - -class SelectTop3(SelectTopK): - - def __init__(self, vectors): - super().__init__(vectors, 3) - - @staticmethod - def name(): - return "top3" - -class SelectTop5(SelectTopK): - - def __init__(self, vectors): - super().__init__(vectors, 5) - - @staticmethod - def name(): - return "top5" - -selection_methods = [ - SelectAll, - SelectRandom1, - SelectRandom2, - SelectRandom3, - SelectRandom5, - SelectTop1, - SelectTop2, - SelectTop3, - SelectTop5 -] +import random +import numpy as np + +class AbstractSelectionMethod: + + def __init__(self, vectors, max_num): + self.vectors = vectors + + if len(self.vectors) < max_num: + max_num = len(self.vectors) + self.max_num = max_num + + def select(self, vectors): + raise NotImplementedError + + +class SelectAll(AbstractSelectionMethod): + + def __init__(self, vectors): + super().__init__(vectors, 999) + + def select(self): + return [x['vector'] for x in self.vectors] + + @staticmethod + def name(): + return "all" + +class SelectRandomK(AbstractSelectionMethod): + def __init__(self, vectors, k): + super().__init__(vectors, k) + + def select(self): + return [x['vector'] for x in random.sample(self.vectors, self.max_num)] + + @staticmethod + def name(): + raise NotImplementedError + +class SelectRandom1(SelectRandomK): + + def __init__(self, vectors): + super().__init__(vectors, 1) + + @staticmethod + def name(): + return "rand1" + +class SelectRandom2(SelectRandomK): + + def __init__(self, vectors): + super().__init__(vectors, 2) + + @staticmethod + def name(): + return "rand2" + +class SelectRandom3(SelectRandomK): + + def __init__(self, vectors): + super().__init__(vectors, 3) + + @staticmethod + def name(): + return "rand3" + +class SelectRandom5(SelectRandomK): + + def __init__(self, vectors): + super().__init__(vectors, 5) + + @staticmethod + def name(): + return "rand5" + +class SelectTopK(AbstractSelectionMethod): + + def __init__(self, vectors, k): + super().__init__(vectors, k) + + def select(self): + self.vectors.sort(key=lambda x: x['length'], reverse=True) + return [x['vector'] for x in self.vectors[:self.max_num]] + + @staticmethod + def name(): + raise NotImplementedError + +class SelectTop1(SelectTopK): + + def __init__(self, vectors): + super().__init__(vectors, 1) + + @staticmethod + def name(): + return "top1" + +class SelectTop2(SelectTopK): + + def __init__(self, vectors): + super().__init__(vectors, 2) + + @staticmethod + def name(): + return "top2" + +class SelectTop3(SelectTopK): + + def __init__(self, vectors): + super().__init__(vectors, 3) + + @staticmethod + def name(): + return "top3" + +class SelectTop5(SelectTopK): + + def __init__(self, vectors): + super().__init__(vectors, 5) + + @staticmethod + def name(): + return "top5" + +selection_methods = [ + SelectAll, + SelectRandom1, + SelectRandom2, + SelectRandom3, + SelectRandom5, + SelectTop1, + SelectTop2, + SelectTop3, + SelectTop5 +] diff --git a/weka_files/README.md b/pipeline/weka_files/README.md similarity index 100% rename from weka_files/README.md rename to pipeline/weka_files/README.md