Skip to content

Commit

Permalink
Split dataset by filenames after filtering
Browse files Browse the repository at this point in the history
  • Loading branch information
lyriccoder committed Jul 6, 2020
1 parent 86f3528 commit bab2311
Showing 1 changed file with 44 additions and 32 deletions.
76 changes: 44 additions & 32 deletions scripts/02-filter-and-move.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,17 @@
required=False,
default=None
)
parser.add_argument(
'--split_only',
required=False,
help='Only split filenames into train and test, do not filter',
default=False,
action='store_true'
)
args = parser.parse_args()
MAX_CLASSES = args.max_classes
TXT_OUT = 'found-java-files.txt'
SPLIT_CSV = 'split.csv'
CSV_OUT = '02-java-files.csv'

DIR_TO_CREATE = 'target/02'
Expand All @@ -66,7 +74,6 @@ class ClassType(Enum):
ABSTRACT_CLASS = 3
TEST = 4
JAVA_PARSE_ERROR = 5
NESTED_CLASSES = 6
CLASS = 999


Expand All @@ -80,27 +87,23 @@ def get_class_type(filename: Path):
class_type = ClassType.CLASS
try:
tree = javalang.parse.parse(text)
classes = list(tree.filter(javalang.tree.ClassDeclaration))
if len(classes) > 1:
class_type = ClassType.NESTED_CLASSES
else:
for _, node in tree:
if type(node) == javalang.tree.InterfaceDeclaration:
class_type = ClassType.INTERFACE
for _, node in tree:
if type(node) == javalang.tree.InterfaceDeclaration:
class_type = ClassType.INTERFACE
break
elif type(node) == javalang.tree.EnumDeclaration:
class_type = ClassType.ENUM
break
elif type(node) == javalang.tree.ClassDeclaration:
if 'abstract' in node.modifiers:
class_type = ClassType.ABSTRACT_CLASS
break
elif type(node) == javalang.tree.EnumDeclaration:
class_type = ClassType.ENUM
elif 'Test' in node.name:
class_type = ClassType.TEST
break
else:
class_type = ClassType.CLASS
break
elif type(node) == javalang.tree.ClassDeclaration:
if 'abstract' in node.modifiers:
class_type = ClassType.ABSTRACT_CLASS
break
elif 'Test' in node.name:
class_type = ClassType.TEST
break
else:
class_type = ClassType.CLASS
break
except Exception:
class_type = ClassType.JAVA_PARSE_ERROR
return class_type
Expand Down Expand Up @@ -196,17 +199,26 @@ def call_back():


if __name__ == '__main__':
start = time.time()
results = walk_in_parallel()

if not os.path.isdir(DIR_TO_CREATE):
os.makedirs(DIR_TO_CREATE)

path_csv_out = str(Path(current_location, DIR_TO_CREATE, CSV_OUT))
path_txt_out = str(Path(current_location, DIR_TO_CREATE, TXT_OUT))
df = pd.DataFrame(results, columns=['filename', 'class_type'])
df = df[df['class_type'] == 999]
df.to_csv(path_csv_out, index=False)
df['filename'].to_csv(path_txt_out, header=None, index=None)
end = time.time()
print('It took ' + str(end - start) + ' seconds')

if not args.split_only:
start = time.time()
results = walk_in_parallel()

if not os.path.isdir(DIR_TO_CREATE):
os.makedirs(DIR_TO_CREATE)

df = pd.DataFrame(results, columns=['filename', 'class_type'])
df = df[df['class_type'] == 999]
df.to_csv(path_csv_out, index=False)
df['filename'].to_csv(path_txt_out, header=None, index=None)
end = time.time()
print('It took ' + str(end - start) + ' seconds')
from sklearn.model_selection import train_test_split
df = pd.read_csv(path_csv_out)
train, test = train_test_split(df['filename'], test_size=0.3, random_state=42)
train_csv_file = str(Path(current_location, DIR_TO_CREATE, '02-train.csv'))
test_csv_file = str(Path(current_location, DIR_TO_CREATE, '02-test.csv'))
train.to_csv(train_csv_file, index=False)
test.to_csv(test_csv_file, index=False)

0 comments on commit bab2311

Please sign in to comment.