From 3de91511e3b8d2f932b0ccdfe155e3a1c13e773b Mon Sep 17 00:00:00 2001 From: Evgeny Maslov Date: Mon, 19 Oct 2020 17:35:08 +0300 Subject: [PATCH 01/17] Add validation procedure --- veniq/dataset_collection/validation.py | 91 ++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 veniq/dataset_collection/validation.py diff --git a/veniq/dataset_collection/validation.py b/veniq/dataset_collection/validation.py new file mode 100644 index 00000000..bfe5f704 --- /dev/null +++ b/veniq/dataset_collection/validation.py @@ -0,0 +1,91 @@ +from pathlib import Path + +import pandas as pd + +from baselines.semi.create_extraction_opportunities import create_extraction_opportunities +from baselines.semi.extract_semantic import extract_method_statements_semantic +from baselines.semi.filter_extraction_opportunities import filter_extraction_opportunities +from baselines.semi.rank_extraction_opportunities import rank_extraction_opportunities +from utils.ast_builder import build_ast +from veniq.ast_framework import AST, ASTNodeType +from random import choice + + +def _print_extraction_opportunities( + method_ast: AST): + statements_semantic = extract_method_statements_semantic(method_ast) + extraction_opportunities = create_extraction_opportunities(statements_semantic) + filtered_extraction_opportunities = filter_extraction_opportunities( + extraction_opportunities, statements_semantic, method_ast + ) + extraction_opportunities_groups = rank_extraction_opportunities( + statements_semantic, filtered_extraction_opportunities + ) + + # print( + # f"Extraction opportunities groups of method {method_name} in class {class_name} in file {filepath}:" + # ) + + # for extraction_opportunity_group in extraction_opportunities_groups: + # print(f"\tExtraction opportunities group with scope {extraction_opportunity_group.benifit}:") + # for extraction_opportunity, benifit in extraction_opportunity_group.opportunities: + # print(f"\t\tExtraction opportunity with score {benifit}:") + # for statement in extraction_opportunity: + # print(f"\t\t\t{statement.node_type} on line {statement.line}") + return extraction_opportunities_groups + + +if __name__ == '__main__': + dir_with_dataset = Path(r'D:\temp\dataset_colelction_refactoring\small_dataset') + df = pd.read_csv(Path(dir_with_dataset) / r'out.csv') + failed_cases_in_SEMI_algorithm = 0 + failed_cases_in_validation_examples = 0 + matched_cases = 0 + no_opportunity_chosen = 0 + total_number = df.shape[0] + for row in df.iterrows(): + start_line = row[1]['start_line'] + end_line = row[1]['end_line'] + src_filename = row[1]['output_filename'] + class_name = row[1]['className'] + + print(class_name) + try: + ast = AST.build_from_javalang(build_ast(dir_with_dataset / src_filename)) + except Exception as e: + failed_cases_in_validation_examples += 1 + + function_to_analyze = row[1]['invocation function name'] + for class_decl in ast.get_proxy_nodes(ASTNodeType.CLASS_DECLARATION): + class_ast = ast.get_subtree(class_decl) + if class_decl.name != class_name: + continue + + for method_decl in class_ast.get_proxy_nodes(ASTNodeType.METHOD_DECLARATION): + if method_decl.name != function_to_analyze: + continue + + try: + opport = _print_extraction_opportunities( + ast.get_subtree(method_decl) + ) + if opport: + best_group = opport[0] + best_opportunity, benefit = choice(list(best_group.opportunities)) + lines = [node.line for node in best_opportunity] + start_line_opportunity = min(lines) + end_line_opportunity = max(lines) + if (start_line == 
start_line_opportunity) and (end_line == end_line_opportunity): + matched_cases += 1 + else: + no_opportunity_chosen += 0 + print(class_decl.name, method_decl.name) + + except Exception as e: + failed_cases_in_SEMI_algorithm += 1 + + break + + matched = (failed_cases_in_SEMI_algorithm + failed_cases_in_validation_examples + + matched_cases + no_opportunity_chosen) + print(float(matched) / total_number) From 85cbc80742af0003bd08210da9488b6f1c503b0f Mon Sep 17 00:00:00 2001 From: Evgeny Maslov Date: Mon, 19 Oct 2020 17:50:33 +0300 Subject: [PATCH 02/17] Fix --- veniq/dataset_collection/validation.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/veniq/dataset_collection/validation.py b/veniq/dataset_collection/validation.py index bfe5f704..6c7ef4fe 100644 --- a/veniq/dataset_collection/validation.py +++ b/veniq/dataset_collection/validation.py @@ -82,10 +82,17 @@ def _print_extraction_opportunities( print(class_decl.name, method_decl.name) except Exception as e: + import traceback + traceback.print_exc() failed_cases_in_SEMI_algorithm += 1 break + print(f'Failed SEMI algorithm errors: {failed_cases_in_SEMI_algorithm}') + print(f'Failed examples of synth dataset: {failed_cases_in_validation_examples}') + print(f'matched_cases: {matched_cases}') + print(f'No opportunity chosen: {no_opportunity_chosen} times') + print(f'Total number of cases: {total_number}') matched = (failed_cases_in_SEMI_algorithm + failed_cases_in_validation_examples + matched_cases + no_opportunity_chosen) print(float(matched) / total_number) From 802b94169d4e01d3c29dc31bf22d6a57b6619018 Mon Sep 17 00:00:00 2001 From: Evgeny Maslov Date: Thu, 22 Oct 2020 18:15:19 +0300 Subject: [PATCH 03/17] Fix validation --- veniq/dataset_collection/validation.py | 102 ++++++++++++++++--------- 1 file changed, 65 insertions(+), 37 deletions(-) diff --git a/veniq/dataset_collection/validation.py b/veniq/dataset_collection/validation.py index 6c7ef4fe..2642472a 100644 --- a/veniq/dataset_collection/validation.py +++ b/veniq/dataset_collection/validation.py @@ -1,3 +1,4 @@ +from argparse import ArgumentParser from pathlib import Path import pandas as pd @@ -6,6 +7,7 @@ from baselines.semi.extract_semantic import extract_method_statements_semantic from baselines.semi.filter_extraction_opportunities import filter_extraction_opportunities from baselines.semi.rank_extraction_opportunities import rank_extraction_opportunities +from dataset_collection.augmentation import method_body_lines from utils.ast_builder import build_ast from veniq.ast_framework import AST, ASTNodeType from random import choice @@ -36,63 +38,89 @@ def _print_extraction_opportunities( if __name__ == '__main__': - dir_with_dataset = Path(r'D:\temp\dataset_colelction_refactoring\small_dataset') - df = pd.read_csv(Path(dir_with_dataset) / r'out.csv') + parser = ArgumentParser() + parser.add_argument( + "-d", "--dataset_dir", + help="Path for file with output results", + required=True + ) + parser.add_argument( + "-i", "--csv_input", + help="Path for csv" + ) + args = parser.parse_args() + dataset_dir = Path(args.dataset_dir) + csv_dataset_filename = Path(args.csv_input) + df = pd.read_csv(csv_dataset_filename) failed_cases_in_SEMI_algorithm = 0 failed_cases_in_validation_examples = 0 matched_cases = 0 no_opportunity_chosen = 0 total_number = df.shape[0] + matched_percent = 0 + # f = r'D:\temp\dataset_colelction_refactoring\small_dataset\output_files\SecurityConstraintPanel_setValue_192.java' + # ast = AST.build_from_javalang(build_ast(f)) + # class_t = [x for x in 
ast.get_proxy_nodes(ASTNodeType.CLASS_DECLARATION) if x.name == 'SecurityConstraintPanel'][0] + # method_decl = [x for x in ast.get_proxy_nodes(ASTNodeType.METHOD_DECLARATION) if x.name == 'refillUserDataConstraint'][0] + # body_start_line, body_end_line = method_body_lines(method_decl, f) + # print(body_start_line, body_end_line) + iteration_number = 0 + for row in df.iterrows(): + iteration_number += 1 start_line = row[1]['start_line'] end_line = row[1]['end_line'] src_filename = row[1]['output_filename'] class_name = row[1]['className'] - - print(class_name) try: - ast = AST.build_from_javalang(build_ast(dir_with_dataset / src_filename)) - except Exception as e: - failed_cases_in_validation_examples += 1 + ast = AST.build_from_javalang(build_ast(dataset_dir / src_filename)) + function_to_analyze = row[1]['invocation function name'] + for class_decl in ast.get_proxy_nodes(ASTNodeType.CLASS_DECLARATION): + # class_ast = ast.get_subtree(class_decl) + if class_decl.name != class_name: + continue + elif class_decl.name == class_name: + for method_decl in class_decl.methods: + if method_decl.name != function_to_analyze: + continue + try: + print( + f'Trying analyze {class_decl.name} {method_decl.name} {iteration_number}/{total_number}') + opport = _print_extraction_opportunities( + ast.get_subtree(method_decl) + ) + if opport: + best_group = opport[0] + lines = [node.line for node in best_group._optimal_opportunity] + start_line_opportunity = min(lines) + end_line_opportunity = max(lines) + lines_intersected = set(range(start_line, end_line)) & set(lines) - function_to_analyze = row[1]['invocation function name'] - for class_decl in ast.get_proxy_nodes(ASTNodeType.CLASS_DECLARATION): - class_ast = ast.get_subtree(class_decl) - if class_decl.name != class_name: - continue + if (start_line == start_line_opportunity) and (end_line == end_line_opportunity): + matched_cases += 1 + matched_percent += float(len(lines_intersected)) / len(lines) + else: + no_opportunity_chosen += 0 + print(class_decl.name, method_decl.name) - for method_decl in class_ast.get_proxy_nodes(ASTNodeType.METHOD_DECLARATION): - if method_decl.name != function_to_analyze: - continue + except Exception as e: + import traceback - try: - opport = _print_extraction_opportunities( - ast.get_subtree(method_decl) - ) - if opport: - best_group = opport[0] - best_opportunity, benefit = choice(list(best_group.opportunities)) - lines = [node.line for node in best_opportunity] - start_line_opportunity = min(lines) - end_line_opportunity = max(lines) - if (start_line == start_line_opportunity) and (end_line == end_line_opportunity): - matched_cases += 1 - else: - no_opportunity_chosen += 0 - print(class_decl.name, method_decl.name) + traceback.print_exc() + failed_cases_in_SEMI_algorithm += 1 - except Exception as e: - import traceback - traceback.print_exc() - failed_cases_in_SEMI_algorithm += 1 + break + break - break + except Exception as e: + failed_cases_in_validation_examples += 1 print(f'Failed SEMI algorithm errors: {failed_cases_in_SEMI_algorithm}') print(f'Failed examples of synth dataset: {failed_cases_in_validation_examples}') print(f'matched_cases: {matched_cases}') print(f'No opportunity chosen: {no_opportunity_chosen} times') print(f'Total number of cases: {total_number}') - matched = (failed_cases_in_SEMI_algorithm + failed_cases_in_validation_examples - + matched_cases + no_opportunity_chosen) + print(f'Total number of matched lines: {matched_percent}') + matched = (matched_cases + no_opportunity_chosen) + total_number = 
total_number - failed_cases_in_SEMI_algorithm - failed_cases_in_validation_examples print(float(matched) / total_number) From 9c3dc65f7ab6fea857faa6e3b9d96e8551e1d27f Mon Sep 17 00:00:00 2001 From: Evgeny Maslov Date: Mon, 26 Oct 2020 12:29:00 +0300 Subject: [PATCH 04/17] Remove template request --- .github/pull_request_template.md | 7 ------- veniq/dataset_collection/validation.py | 3 ++- 2 files changed, 2 insertions(+), 8 deletions(-) delete mode 100644 .github/pull_request_template.md diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md deleted file mode 100644 index 2191affe..00000000 --- a/.github/pull_request_template.md +++ /dev/null @@ -1,7 +0,0 @@ -Please specify the following in the description: - -- Meaningful title. Use the tempalte: ```[Milestone]```.```PullRequest_Title``` -- A reference to the original GitHub issue using # selector. -- A description/notes about this pull request. -- If you fix a bug don't forget about test case. -- If you add a new feature don't forget about test case. diff --git a/veniq/dataset_collection/validation.py b/veniq/dataset_collection/validation.py index 2642472a..6b09d2f8 100644 --- a/veniq/dataset_collection/validation.py +++ b/veniq/dataset_collection/validation.py @@ -52,6 +52,7 @@ def _print_extraction_opportunities( dataset_dir = Path(args.dataset_dir) csv_dataset_filename = Path(args.csv_input) df = pd.read_csv(csv_dataset_filename) + df_is_parsed = df[df['can_be_parsed']] failed_cases_in_SEMI_algorithm = 0 failed_cases_in_validation_examples = 0 matched_cases = 0 @@ -66,7 +67,7 @@ def _print_extraction_opportunities( # print(body_start_line, body_end_line) iteration_number = 0 - for row in df.iterrows(): + for row in df_is_parsed.iterrows(): iteration_number += 1 start_line = row[1]['start_line'] end_line = row[1]['end_line'] From a3a6e233313833ace3fa0c4358cb18b91748005f Mon Sep 17 00:00:00 2001 From: Evgeny Maslov Date: Mon, 26 Oct 2020 14:52:23 +0300 Subject: [PATCH 05/17] Fix print --- veniq/dataset_collection/validation.py | 30 +++++++++++++++++--------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/veniq/dataset_collection/validation.py b/veniq/dataset_collection/validation.py index 6b09d2f8..16871be4 100644 --- a/veniq/dataset_collection/validation.py +++ b/veniq/dataset_collection/validation.py @@ -69,13 +69,19 @@ def _print_extraction_opportunities( for row in df_is_parsed.iterrows(): iteration_number += 1 - start_line = row[1]['start_line'] - end_line = row[1]['end_line'] + start_line_of_invocation_occurred = row[1]['start_line_of_function_where_invocation_occurred'] + start_line_of_invoked_function = row[1]['invocation_method_start_line'] + end_line_of_invoked_function = row[1]['invocation_method_end_line'] + end_line_of_invocation_occurred = end_line_of_invoked_function - start_line_of_invoked_function + lines_inserted = end_line_of_invocation_occurred - start_line_of_invocation_occurred + if lines_inserted >= 1: + continue + src_filename = row[1]['output_filename'] - class_name = row[1]['className'] + class_name = row[1]['class_name'] try: ast = AST.build_from_javalang(build_ast(dataset_dir / src_filename)) - function_to_analyze = row[1]['invocation function name'] + function_to_analyze = row[1]['invocation_method_name'] for class_decl in ast.get_proxy_nodes(ASTNodeType.CLASS_DECLARATION): # class_ast = ast.get_subtree(class_decl) if class_decl.name != class_name: @@ -85,8 +91,9 @@ def _print_extraction_opportunities( if method_decl.name != function_to_analyze: continue try: - 
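
Editorial aside, not part of the patch: the heart of this validation loop is comparing the line span of the block that was inlined (recorded in the CSV) against the span of the highest-ranked SEMI extraction opportunity. A minimal, self-contained sketch of that comparison is below; the helper name `overlap_fraction` and the literal line numbers are illustrative only, and later revisions in this series adjust the exact arithmetic.

```python
# Illustrative sketch of the span comparison performed by the validation loop.
from typing import Iterable


def overlap_fraction(inserted_start: int, inserted_end: int,
                     opportunity_lines: Iterable[int]) -> float:
    """Share of the inlined block's lines covered by the chosen opportunity."""
    inserted = set(range(inserted_start, inserted_end + 1))
    if not inserted:
        return 0.0
    return len(inserted & set(opportunity_lines)) / len(inserted)


if __name__ == '__main__':
    # Block inlined on lines 10-14, SEMI suggested extracting lines 10-13.
    print(overlap_fraction(10, 14, range(10, 14)))  # 0.8
```
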
print( - f'Trying analyze {class_decl.name} {method_decl.name} {iteration_number}/{total_number}') + # print( + # f'Trying analyze {class_decl.name} {method_decl.name} ' + # f'{iteration_number}/{total_number}') opport = _print_extraction_opportunities( ast.get_subtree(method_decl) ) @@ -95,18 +102,21 @@ def _print_extraction_opportunities( lines = [node.line for node in best_group._optimal_opportunity] start_line_opportunity = min(lines) end_line_opportunity = max(lines) - lines_intersected = set(range(start_line, end_line)) & set(lines) + lines_intersected = set( + range(end_line_of_invocation_occurred, end_line_of_invocation_occurred)) \ + & set(lines) - if (start_line == start_line_opportunity) and (end_line == end_line_opportunity): + if (start_line_of_invocation_occurred == start_line_opportunity) \ + and (end_line_of_invocation_occurred == end_line_opportunity): matched_cases += 1 matched_percent += float(len(lines_intersected)) / len(lines) else: no_opportunity_chosen += 0 - print(class_decl.name, method_decl.name) + # print(class_decl.name, method_decl.name) except Exception as e: import traceback - + print(src_filename) traceback.print_exc() failed_cases_in_SEMI_algorithm += 1 From f128e4ed2994d3a4a00910905b4f0587e44c0677 Mon Sep 17 00:00:00 2001 From: Evgeny Maslov Date: Mon, 26 Oct 2020 15:03:18 +0300 Subject: [PATCH 06/17] Fix imports --- veniq/dataset_collection/validation.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/veniq/dataset_collection/validation.py b/veniq/dataset_collection/validation.py index 16871be4..5a7d9a37 100644 --- a/veniq/dataset_collection/validation.py +++ b/veniq/dataset_collection/validation.py @@ -3,12 +3,12 @@ import pandas as pd -from baselines.semi.create_extraction_opportunities import create_extraction_opportunities -from baselines.semi.extract_semantic import extract_method_statements_semantic -from baselines.semi.filter_extraction_opportunities import filter_extraction_opportunities -from baselines.semi.rank_extraction_opportunities import rank_extraction_opportunities -from dataset_collection.augmentation import method_body_lines -from utils.ast_builder import build_ast +from veniq.baselines.semi.create_extraction_opportunities import create_extraction_opportunities +from veniq.baselines.semi.extract_semantic import extract_method_statements_semantic +from veniq.baselines.semi.filter_extraction_opportunities import filter_extraction_opportunities +from veniq.baselines.semi.rank_extraction_opportunities import rank_extraction_opportunities +from veniq.dataset_collection.augmentation import method_body_lines +from veniq.utils.ast_builder import build_ast from veniq.ast_framework import AST, ASTNodeType from random import choice From e50e37fe515fdd22408a33f2ea656c9899da6a36 Mon Sep 17 00:00:00 2001 From: Evgeny Maslov Date: Mon, 26 Oct 2020 17:56:32 +0300 Subject: [PATCH 07/17] Temp rpint --- veniq/dataset_collection/augmentation.py | 600 +++++------------------ 1 file changed, 119 insertions(+), 481 deletions(-) diff --git a/veniq/dataset_collection/augmentation.py b/veniq/dataset_collection/augmentation.py index 0622e26e..5fed8d7b 100644 --- a/veniq/dataset_collection/augmentation.py +++ b/veniq/dataset_collection/augmentation.py @@ -1,500 +1,138 @@ -import hashlib -import os -import os.path -import shutil -import tarfile -import typing from argparse import ArgumentParser -from collections import defaultdict -from functools import partial from pathlib import Path -from typing import Tuple, Dict, List, Any, Set, 
Optional import pandas as pd -from pebble import ProcessPool -from tqdm import tqdm -from veniq.metrics.ncss.ncss import NCSSMetric -from veniq.ast_framework import AST, ASTNodeType, ASTNode -from veniq.dataset_collection.types_identifier import AlgorithmFactory, InlineTypesAlgorithms +from veniq.baselines.semi.create_extraction_opportunities import create_extraction_opportunities +from veniq.baselines.semi.extract_semantic import extract_method_statements_semantic +from veniq.baselines.semi.filter_extraction_opportunities import filter_extraction_opportunities +from veniq.baselines.semi.rank_extraction_opportunities import rank_extraction_opportunities +from veniq.dataset_collection.augmentation import method_body_lines from veniq.utils.ast_builder import build_ast -from veniq.utils.encoding_detector import read_text_with_autodetected_encoding +from veniq.ast_framework import AST, ASTNodeType +from random import choice -def _get_last_line(file_path: Path, start_line: int) -> int: - """ - This function is aimed to find the last body line of - considered method. It work by counting the difference - in number of openning brackets '{' and closing brackets - '}'. It's start with the method declaration line and going - to the line where the difference is equal to 0. Which means - that we found closind bracket of method declaration. - """ - with open(file_path, encoding='utf-8') as f: - file_lines = list(f) - # to start counting opening brackets - difference_cases = 0 - - processed_declaration_line = file_lines[start_line - 1].split('//')[0] - difference_cases += processed_declaration_line.count('{') - difference_cases -= processed_declaration_line.count('}') - for i, line in enumerate(file_lines[start_line:], start_line): - if difference_cases: - line_without_comments = line.split('//')[0] - difference_cases += line_without_comments.count('{') - difference_cases -= line_without_comments.count('}') - else: - return i - - return -1 - - -def get_line_with_first_open_bracket( - file_path: Path, - method_decl_start_line: int -) -> int: - f = open(file_path, encoding='utf-8') - file_lines = list(f) - for i, line in enumerate(file_lines[method_decl_start_line - 2:], method_decl_start_line - 2): - if '{' in line: - return i + 1 - return method_decl_start_line + 1 - - -def method_body_lines(method_node: ASTNode, file_path: Path) -> Tuple[int, int]: - """ - Get start and end of method's body - """ - if len(method_node.body): - m_decl_start_line = start_line = method_node.line + 1 - start_line = get_line_with_first_open_bracket(file_path, m_decl_start_line) - end_line = _get_last_line(file_path, start_line) - else: - start_line = end_line = -1 - return start_line, end_line - - -@typing.no_type_check -def is_match_to_the_conditions( - ast: AST, - method_invoked: ASTNode, - found_method_decl=None) -> bool: - if method_invoked.parent.node_type == ASTNodeType.THIS: - parent = method_invoked.parent.parent - class_names = [x for x in method_invoked.parent.children if hasattr(x, 'string')] - member_references = [x for x in method_invoked.parent.children if hasattr(x, 'member')] - lst = [x for x in member_references if x.member != method_invoked.member] + class_names - no_children = not lst - else: - parent = method_invoked.parent - no_children = True - - maybe_if = parent.parent - is_not_method_inv_single_statement_in_if = True - if maybe_if.node_type == ASTNodeType.IF_STATEMENT: - if hasattr(maybe_if.then_statement, 'expression'): - if maybe_if.then_statement.expression.node_type == 
ASTNodeType.METHOD_INVOCATION: - is_not_method_inv_single_statement_in_if = False - - is_not_assign_value_with_return_type = True - is_not_several_returns = True - if found_method_decl.return_type: - if parent.node_type == ASTNodeType.VARIABLE_DECLARATOR: - is_not_assign_value_with_return_type = False - - ast_subtree = ast.get_subtree(found_method_decl) - stats = [x for x in ast_subtree.get_proxy_nodes(ASTNodeType.RETURN_STATEMENT)] - if len(stats) > 1: - is_not_several_returns = False - - is_not_parent_member_ref = not (method_invoked.parent.node_type == ASTNodeType.MEMBER_REFERENCE) - is_not_chain_before = not (parent.node_type == ASTNodeType.METHOD_INVOCATION) and no_children - chains_after = [x for x in method_invoked.children if x.node_type == ASTNodeType.METHOD_INVOCATION] - is_not_chain_after = not chains_after - is_not_inside_if = not (parent.node_type == ASTNodeType.IF_STATEMENT) - is_not_inside_while = not (parent.node_type == ASTNodeType.WHILE_STATEMENT) - is_not_inside_for = not (parent.node_type == ASTNodeType.FOR_STATEMENT) - is_not_enhanced_for_control = not (parent.node_type == ASTNodeType.ENHANCED_FOR_CONTROL) - # ignore case else if (getServiceInterface() != null) { - is_not_binary_operation = not (parent.node_type == ASTNodeType.BINARY_OPERATION) - is_not_ternary = not (parent.node_type == ASTNodeType.TERNARY_EXPRESSION) - # if a parameter is any expression, we ignore it, - # since it is difficult to extract with AST - is_actual_parameter_simple = all([hasattr(x, 'member') for x in method_invoked.arguments]) - is_not_class_creator = not (parent.node_type == ASTNodeType.CLASS_CREATOR) - is_not_cast = not (parent.node_type == ASTNodeType.CAST) - is_not_array_creator = not (parent.node_type == ASTNodeType.ARRAY_CREATOR) - is_not_lambda = not (parent.node_type == ASTNodeType.LAMBDA_EXPRESSION) - other_requirements = all([ - is_not_chain_before, - is_actual_parameter_simple, - is_not_chain_after, - is_not_inside_if, - is_not_inside_while, - is_not_binary_operation, - is_not_ternary, - is_not_class_creator, - is_not_cast, - is_not_array_creator, - is_not_parent_member_ref, - is_not_inside_for, - is_not_enhanced_for_control, - is_not_lambda, - is_not_method_inv_single_statement_in_if, - is_not_assign_value_with_return_type, - is_not_several_returns, - not method_invoked.arguments]) - - if (not method_invoked.qualifier and other_requirements) or \ - (method_invoked.qualifier == 'this' and other_requirements): - return True - else: - return False - - -def check_whether_method_has_return_type( - method_decl: AST, - var_decls: Set[str]) -> InlineTypesAlgorithms: - """ - Run function to check whether Method declaration can be inlined - :param method_decl: method, where invocation occurred - :param var_decls: set of variables for found invoked method - :return: enum InlineTypesAlgorithms - """ - names = get_variables_decl_in_node(method_decl) - - var_decls_original = set(names) - intersected_names = var_decls & var_decls_original - # if we do not have intersected name in target method and inlined method - # and if we do not have var declarations at all - if not var_decls or not intersected_names: - return InlineTypesAlgorithms.WITHOUT_RETURN_WITHOUT_ARGUMENTS - - return InlineTypesAlgorithms.DO_NOTHING - - -def get_variables_decl_in_node( - method_decl: AST) -> List[str]: - names = [] - for x in method_decl.get_proxy_nodes(ASTNodeType.VARIABLE_DECLARATOR): - if hasattr(x, 'name'): - names.append(x.name) - elif hasattr(x, 'names'): - names.extend(x.names) - - for x in 
method_decl.get_proxy_nodes(ASTNodeType.VARIABLE_DECLARATION): - if hasattr(x, 'name'): - names.append(x.name) - elif hasattr(x, 'names'): - names.extend(x.names) - - for x in method_decl.get_proxy_nodes(ASTNodeType.TRY_RESOURCE): - names.append(x.name) - - return names - - -def determine_algorithm_insertion_type( - ast: AST, - method_node: ASTNode, - invocation_node: ASTNode, - dict_original_nodes: Dict[str, List[ASTNode]] -) -> InlineTypesAlgorithms: - """ - - :param ast: ast tree - :param dict_original_nodes: dict with names of function as key - and list of ASTNode as values - :param method_node: Method declaration. In this method invocation occurred - :param invocation_node: invocation node - :return: InlineTypesAlgorithms enum - """ - - original_invoked_method = dict_original_nodes.get(invocation_node.member, []) - # ignore overridden functions - if (len(original_invoked_method) == 0) or (len(original_invoked_method) > 1): - return InlineTypesAlgorithms.DO_NOTHING - else: - original_method = original_invoked_method[0] - if not original_method.parameters: - if not original_method.return_type: - # Find the original method declaration by the name of method invocation - var_decls = set(get_variables_decl_in_node(ast.get_subtree(original_method))) - return check_whether_method_has_return_type( - ast.get_subtree(method_node), - var_decls - ) - else: - return InlineTypesAlgorithms.WITH_RETURN_WITHOUT_ARGUMENTS - else: - return InlineTypesAlgorithms.DO_NOTHING - - -def insert_code_with_new_file_creation( - class_name: str, - ast: AST, - method_node: ASTNode, - invocation_node: ASTNode, - file_path: Path, - output_path: Path, - dict_original_invocations: Dict[str, List[ASTNode]] -) -> Dict[str, Any]: - """ - If invocations of class methods were found, - we process through all of them and for each - substitution opportunity by method's body, - we create new file. 
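
Editorial aside for orientation while reading the removed augmentation code: each accepted inlining opportunity is written to a new file whose name encodes the source file stem, the enclosing method, and the invocation line, exactly as `new_full_filename` is built in `insert_code_with_new_file_creation`. A small sketch of that naming scheme follows; the helper name `make_output_name` is invented for illustration.

```python
from pathlib import Path


def make_output_name(source_file: Path, method_name: str,
                     invocation_line: int, output_dir: Path) -> Path:
    """Mirror of the per-example output path used when a new file is created."""
    return output_dir / f"{source_file.stem}_{method_name}_{invocation_line}.java"


# e.g. output_files/DynaMenuModel_loadSubmenu_42.java
print(make_output_name(Path("DynaMenuModel.java"), "loadSubmenu", 42, Path("output_files")))
```
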
- """ - file_name = file_path.stem - if not os.path.exists(output_path): - output_path.mkdir(parents=True) - - new_full_filename = Path(output_path, f'{file_name}_{method_node.name}_{invocation_node.line}.java') - original_func = dict_original_invocations.get(invocation_node.member)[0] # type: ignore - ncss = NCSSMetric().value(ast.get_subtree(original_func)) - line_to_csv = {} - if ncss > 3: - body_start_line, body_end_line = method_body_lines(original_func, file_path) - text_lines = read_text_with_autodetected_encoding(str(file_path)).split('\n') - if body_start_line != body_end_line: - algorithm_type = determine_algorithm_insertion_type( - ast, - method_node, - invocation_node, - dict_original_invocations - ) - algorithm_for_inlining = AlgorithmFactory().create_obj(algorithm_type) - if algorithm_type != InlineTypesAlgorithms.DO_NOTHING: - line_to_csv = { - 'input_filename': file_path, - 'class_name': class_name, - 'invocation_text_string': text_lines[invocation_node.line - 1].lstrip(), - 'method_where_invocation_occurred': method_node.name, - 'start_line_of_function_where_invocation_occurred': method_node.line, - 'invocation_method_name': original_func.name, - 'invocation_method_start_line': body_start_line, - 'invocation_method_end_line': body_end_line, - 'output_filename': new_full_filename, - } - - algorithm_for_inlining().inline_function( - file_path, - invocation_node.line, - body_start_line, - body_end_line, - new_full_filename, - ) - - # if get_ast_if_possible(Path(r'D:\temp\AbstractComponent_addBefore_259.java')): - if get_ast_if_possible(Path(new_full_filename)): - can_be_parsed = True - else: - can_be_parsed = False - - line_to_csv['can_be_parsed'] = can_be_parsed - - return line_to_csv - - -def get_ast_if_possible(file_path: Path) -> Optional[AST]: - """ - Processing file in order to check - that its original version can be parsed - """ - ast = None - try: - ast = AST.build_from_javalang(build_ast(str(file_path))) - except Exception: - print(f"Processing {file_path} is aborted due to parsing") - return ast - - -def analyze_file(file_path: Path, output_path: Path) -> List[Any]: - """ - In this function we process each file. - For each file we find each invocation inside, - which can be inlined. 
- """ - # print(file_path) - results: List[Any] = [] - ast = get_ast_if_possible(file_path) - if ast is None: - return results - - method_declarations = defaultdict(list) - classes_declaration = [ - ast.get_subtree(node) - for node in ast.get_root().types - if node.node_type == ASTNodeType.CLASS_DECLARATION - ] - for class_ast in classes_declaration: - class_declaration = class_ast.get_root() - for method in class_declaration.methods: - if not method.parameters: - method_declarations[method.name].append(method) - - methods_list = list(class_declaration.methods) + list(class_declaration.constructors) - for method_node in methods_list: - method_decl = ast.get_subtree(method_node) - for method_invoked in method_decl.get_proxy_nodes( - ASTNodeType.METHOD_INVOCATION): - found_method_decl = method_declarations.get(method_invoked.member, []) - # ignore overloaded functions - if len(found_method_decl) == 1: - try: - is_matched = is_match_to_the_conditions( - ast, - method_invoked, - found_method_decl[0] - ) - if is_matched: - log_of_inline = insert_code_with_new_file_creation( - class_declaration.name, - ast, - method_node, - method_invoked, - file_path, - output_path, - method_declarations) - if log_of_inline: - results.append(log_of_inline) - except Exception as e: - print('Error has happened during file analyze: ' + str(e)) - return results +def _print_extraction_opportunities( + method_ast: AST): + statements_semantic = extract_method_statements_semantic(method_ast) + extraction_opportunities = create_extraction_opportunities(statements_semantic) + filtered_extraction_opportunities = filter_extraction_opportunities( + extraction_opportunities, statements_semantic, method_ast + ) + extraction_opportunities_groups = rank_extraction_opportunities( + statements_semantic, filtered_extraction_opportunities + ) + # print( + # f"Extraction opportunities groups of method {method_name} in class {class_name} in file {filepath}:" + # ) -def save_input_file(input_dir: Path, filename: Path) -> Path: - # need to avoid situation when filenames are the same - hash_path = hashlib.sha256(str(filename.parent).encode('utf-8')).hexdigest() - dst_filename = input_dir / f'{filename.stem}_{hash_path}.java' - if not dst_filename.parent.exists(): - dst_filename.parent.mkdir(parents=True) - if not dst_filename.exists(): - shutil.copyfile(filename, dst_filename) - return dst_filename + # for extraction_opportunity_group in extraction_opportunities_groups: + # print(f"\tExtraction opportunities group with scope {extraction_opportunity_group.benifit}:") + # for extraction_opportunity, benifit in extraction_opportunity_group.opportunities: + # print(f"\t\tExtraction opportunity with score {benifit}:") + # for statement in extraction_opportunity: + # print(f"\t\t\t{statement.node_type} on line {statement.line}") + return extraction_opportunities_groups -if __name__ == '__main__': # noqa: C901 - system_cores_qty = os.cpu_count() or 1 +if __name__ == '__main__': parser = ArgumentParser() parser.add_argument( - "-d", "--dir", required=True, help="File path to JAVA source code for methods augmentations" - ) - parser.add_argument( - "-o", "--output", + "-d", "--dataset_dir", help="Path for file with output results", - default='augmented_data' - ) - parser.add_argument( - "--jobs", - "-j", - type=int, - default=system_cores_qty - 1, - help="Number of processes to spawn. " - "By default one less than number of cores. 
" - "Be careful to raise it above, machine may stop responding while creating dataset.", - ) - parser.add_argument( - "-z", "--zip", - action='store_true', - help="To zip input and output files." + required=True ) parser.add_argument( - "-s", "--small_dataset_size", - help="Number of files in small dataset", - default=100, - type=int, + "-i", "--csv_input", + help="Path for csv" ) - args = parser.parse_args() - - test_files = set(Path(args.dir).glob('**/*Test*.java')) - not_test_files = set(Path(args.dir).glob('**/*.java')) - files_without_tests = list(not_test_files.difference(test_files)) - - full_dataset_folder = Path(args.output) / 'full_dataset' - output_dir = full_dataset_folder / 'output_files' - if not output_dir.exists(): - output_dir.mkdir(parents=True) - - input_dir = full_dataset_folder / 'input_files' - if not input_dir.exists(): - input_dir.mkdir(parents=True) - csv_output = Path(full_dataset_folder, 'out.csv') - - df = pd.DataFrame( - columns=[ - 'input_filename', - 'class_name', - 'invocation_text_string', - 'method_where_invocation_occurred', - 'start_line_of_function_where_invocation_occurred', - 'invocation_method_name', - 'invocation_method_start_line', - 'invocation_method_end_line', - 'output_filename', - 'can_be_parsed' - ]) - - with ProcessPool(system_cores_qty) as executor: - p_analyze = partial(analyze_file, output_path=output_dir.absolute()) - future = executor.map(p_analyze, files_without_tests, timeout=1000, ) - result = future.result() - - # each 100 cycles we dump the results - iteration_cycle = 1000 - iteration_number = 0 - for filename in tqdm(files_without_tests): - try: - single_file_features = next(result) - if single_file_features: - for i in single_file_features: - dst_filename = save_input_file(input_dir, filename) - # change source filename, since it will be changed - i['input_filename'] = str(dst_filename.as_posix()) - # get local path for inlined filename - i['output_filename'] = i['output_filename'].relative_to(os.getcwd()).as_posix() - i['invocation_text_string'] = str(i['invocation_text_string']).encode('utf8') - df = df.append(i, ignore_index=True) - - if (iteration_number % iteration_cycle) == 0: - df.to_csv(csv_output) - iteration_number += 1 - except Exception: - continue - - df.to_csv(csv_output) - if args.zip: - samples = pd.read_csv(csv_output).sample(args.small_dataset_size, random_state=41) - small_dataset_folder = Path(args.output) / 'small_dataset' - if not small_dataset_folder.exists(): - small_dataset_folder.mkdir(parents=True) - small_input_dir = small_dataset_folder / 'input_files' - if not small_input_dir.exists(): - small_input_dir.mkdir(parents=True) - small_output_dir = small_dataset_folder / 'output_files' - if not small_output_dir.exists(): - small_output_dir.mkdir(parents=True) - - samples.to_csv(small_dataset_folder / 'out.csv') - for i in samples.iterrows(): - input_filename = i[1]['input_filename'] - dst_filename = small_input_dir / Path(input_filename).name - # print(f"Copy from {input_filename}, to {dst_filename}") - shutil.copyfile(input_filename, dst_filename) - output_filename = i[1]['output_filename'] - dst_filename = small_output_dir / Path(output_filename).name - # print(f"Copy from {output_filename}, to {dst_filename}") - shutil.copyfile(output_filename, dst_filename) - - with tarfile.open(Path(args.output) / 'small_dataset.tar.gz', "w:gz") as tar: - tar.add(str(small_dataset_folder), arcname=str(small_dataset_folder)) - - with tarfile.open(Path(args.output) / 'full_dataset.tar.gz', "w:gz") as tar: - 
tar.add(str(full_dataset_folder), arcname=str(full_dataset_folder)) - - if input_dir.exists(): - shutil.rmtree(full_dataset_folder) - - if small_dataset_folder.exists(): - shutil.rmtree(small_dataset_folder) + dataset_dir = Path(args.dataset_dir) + csv_dataset_filename = Path(args.csv_input) + df = pd.read_csv(csv_dataset_filename) + df_is_parsed = df[df['can_be_parsed']] + failed_cases_in_SEMI_algorithm = 0 + failed_cases_in_validation_examples = 0 + matched_cases = 0 + no_opportunity_chosen = 0 + total_number = df.shape[0] + matched_percent = 0 + # f = r'D:\temp\dataset_colelction_refactoring\small_dataset\output_files\SecurityConstraintPanel_setValue_192.java' + # ast = AST.build_from_javalang(build_ast(f)) + # class_t = [x for x in ast.get_proxy_nodes(ASTNodeType.CLASS_DECLARATION) if x.name == 'SecurityConstraintPanel'][0] + # method_decl = [x for x in ast.get_proxy_nodes(ASTNodeType.METHOD_DECLARATION) if x.name == 'refillUserDataConstraint'][0] + # body_start_line, body_end_line = method_body_lines(method_decl, f) + # print(body_start_line, body_end_line) + iteration_number = 0 + + for row in df_is_parsed.iterrows(): + iteration_number += 1 + start_line_of_invocation_occurred = row[1]['start_line_of_function_where_invocation_occurred'] + start_line_of_invoked_function = row[1]['invocation_method_start_line'] + end_line_of_invoked_function = row[1]['invocation_method_end_line'] + end_line_of_invocation_occurred = end_line_of_invoked_function - start_line_of_invoked_function + lines_inserted = end_line_of_invocation_occurred - start_line_of_invocation_occurred + if lines_inserted >= 1: + continue + + src_filename = row[1]['output_filename'] + class_name = row[1]['class_name'] + try: + print(dataset_dir / src_filename) + ast = AST.build_from_javalang(build_ast(dataset_dir / src_filename)) + function_to_analyze = row[1]['invocation_method_name'] + for class_decl in ast.get_proxy_nodes(ASTNodeType.CLASS_DECLARATION): + # class_ast = ast.get_subtree(class_decl) + if class_decl.name != class_name: + continue + elif class_decl.name == class_name: + for method_decl in class_decl.methods: + if method_decl.name != function_to_analyze: + continue + try: + # print( + # f'Trying analyze {class_decl.name} {method_decl.name} ' + # f'{iteration_number}/{total_number}') + opport = _print_extraction_opportunities( + ast.get_subtree(method_decl) + ) + if opport: + best_group = opport[0] + lines = [node.line for node in best_group._optimal_opportunity] + start_line_opportunity = min(lines) + end_line_opportunity = max(lines) + lines_intersected = set( + range(end_line_of_invocation_occurred, end_line_of_invocation_occurred)) \ + & set(lines) + + if (start_line_of_invocation_occurred == start_line_opportunity) \ + and (end_line_of_invocation_occurred == end_line_opportunity): + matched_cases += 1 + matched_percent += float(len(lines_intersected)) / len(lines) + else: + no_opportunity_chosen += 0 + # print(class_decl.name, method_decl.name) + + except Exception as e: + import traceback + print(src_filename) + traceback.print_exc() + failed_cases_in_SEMI_algorithm += 1 + + break + break + + except Exception as e: + failed_cases_in_validation_examples += 1 + + print(f'Failed SEMI algorithm errors: {failed_cases_in_SEMI_algorithm}') + print(f'Failed examples of synth dataset: {failed_cases_in_validation_examples}') + print(f'matched_cases: {matched_cases}') + print(f'No opportunity chosen: {no_opportunity_chosen} times') + print(f'Total number of cases: {total_number}') + print(f'Total number of matched 
lines: {matched_percent}') + matched = (matched_cases + no_opportunity_chosen) + total_number = total_number - failed_cases_in_SEMI_algorithm - failed_cases_in_validation_examples + print(float(matched) / total_number) From b6870b5d5734e03dc4b272b9fc213285c654391c Mon Sep 17 00:00:00 2001 From: Evgeny Maslov Date: Tue, 27 Oct 2020 14:42:20 +0300 Subject: [PATCH 08/17] Multithreaded --- veniq/dataset_collection/validation.py | 180 +++++++++++++++++-------- 1 file changed, 122 insertions(+), 58 deletions(-) diff --git a/veniq/dataset_collection/validation.py b/veniq/dataset_collection/validation.py index 5a7d9a37..05f2f83c 100644 --- a/veniq/dataset_collection/validation.py +++ b/veniq/dataset_collection/validation.py @@ -1,7 +1,13 @@ +import os from argparse import ArgumentParser +from collections import namedtuple +from functools import partial from pathlib import Path +from typing import Tuple import pandas as pd +from pebble import ProcessPool +from tqdm import tqdm from veniq.baselines.semi.create_extraction_opportunities import create_extraction_opportunities from veniq.baselines.semi.extract_semantic import extract_method_statements_semantic @@ -37,6 +43,96 @@ def _print_extraction_opportunities( return extraction_opportunities_groups +Stats = namedtuple( + 'Stats', + ['matched_cases', + 'failed_cases_in_SEMI_algorithm', + 'no_opportunity_chosen', + 'matched_percent', + 'failed_cases_in_validation_examples' + ]) + + +def validate_row(dataset_dir: Path, row: pd.Series) -> Tuple[bool, Stats]: + """ + Validate row of dataset + + :param dataset_dir: directory to dataset, path before the relative path in + output_filename + :param row: row of dataframe + :return: boolean value whether we should consider this row or skip it, + Stats - return collected stats + """ + stats = Stats(0, 0, 0, 0, 0) + try: + start_line_of_invocation_occurred = row[1]['start_line_of_function_where_invocation_occurred'] + start_line_of_invoked_function = row[1]['invocation_method_start_line'] + end_line_of_invoked_function = row[1]['invocation_method_end_line'] + end_line_of_invocation_occurred = \ + start_line_of_invocation_occurred + end_line_of_invoked_function - start_line_of_invoked_function + + src_filename = row[1]['output_filename'] + class_name = row[1]['class_name'] + + ast = AST.build_from_javalang(build_ast(dataset_dir / src_filename)) + function_to_analyze = row[1]['method_where_invocation_occurred'] + for class_decl in ast.get_proxy_nodes(ASTNodeType.CLASS_DECLARATION): + # class_ast = ast.get_subtree(class_decl) + if class_decl.name != class_name: + continue + elif class_decl.name == class_name: + objects_to_consider = list(class_decl.methods) + list(class_decl.constructors) or [] + for method_decl in objects_to_consider: + if method_decl.name != function_to_analyze: + continue + try: + opport = _print_extraction_opportunities( + ast.get_subtree(method_decl) + ) + if opport: + best_group = opport[0] + lines = [node.line for node in best_group._optimal_opportunity] + start_line_opportunity = min(lines) + end_line_opportunity = max(lines) + lines_intersected = set( + range(start_line_of_invocation_occurred, end_line_of_invocation_occurred)) \ + & set(lines) + input_f = row[1]['output_filename'] + print( + f'{input_f} {class_decl.name} {method_decl.name}: ' + f'inserted lines: {start_line_of_invocation_occurred}, {end_line_of_invocation_occurred};' + f'opportunity chosen: {start_line_opportunity}, {end_line_opportunity}') + + if (start_line_of_invocation_occurred == start_line_opportunity) \ + and 
(end_line_of_invocation_occurred == end_line_opportunity): + stats._replace(matched_percent=stats.matched_cases + 1) + updated_percent_value = stats.matched_percent + float(len(lines_intersected)) / len(lines) + stats._replace(matched_percent=updated_percent_value) + else: + stats._replace(no_opportunity_chosen=1) + # print(class_decl.name, method_decl.name) + + except Exception as e: + import traceback + print(src_filename) + # traceback.print_exc() + stats._replace(failed_cases_in_SEMI_algorithm=stats.failed_cases_in_SEMI_algorithm + 1) + + break + break + + except Exception as e: + import traceback + # traceback.print_exc() + stats._replace(failed_cases_in_validation_examples=stats.failed_cases_in_validation_examples + 1) + + # smth bad happened + if stats.failed_cases_in_validation_examples: + return False, stats + else: + return True, stats + + if __name__ == '__main__': parser = ArgumentParser() parser.add_argument( @@ -48,6 +144,16 @@ def _print_extraction_opportunities( "-i", "--csv_input", help="Path for csv" ) + system_cores_qty = os.cpu_count() or 1 + parser.add_argument( + "--jobs", + "-j", + type=int, + default=system_cores_qty - 1, + help="Number of processes to spawn. " + "By default one less than number of cores. " + "Be careful to raise it above, machine may stop responding while creating dataset.", + ) args = parser.parse_args() dataset_dir = Path(args.dataset_dir) csv_dataset_filename = Path(args.csv_input) @@ -67,64 +173,22 @@ def _print_extraction_opportunities( # print(body_start_line, body_end_line) iteration_number = 0 - for row in df_is_parsed.iterrows(): - iteration_number += 1 - start_line_of_invocation_occurred = row[1]['start_line_of_function_where_invocation_occurred'] - start_line_of_invoked_function = row[1]['invocation_method_start_line'] - end_line_of_invoked_function = row[1]['invocation_method_end_line'] - end_line_of_invocation_occurred = end_line_of_invoked_function - start_line_of_invoked_function - lines_inserted = end_line_of_invocation_occurred - start_line_of_invocation_occurred - if lines_inserted >= 1: - continue - - src_filename = row[1]['output_filename'] - class_name = row[1]['class_name'] - try: - ast = AST.build_from_javalang(build_ast(dataset_dir / src_filename)) - function_to_analyze = row[1]['invocation_method_name'] - for class_decl in ast.get_proxy_nodes(ASTNodeType.CLASS_DECLARATION): - # class_ast = ast.get_subtree(class_decl) - if class_decl.name != class_name: - continue - elif class_decl.name == class_name: - for method_decl in class_decl.methods: - if method_decl.name != function_to_analyze: - continue - try: - # print( - # f'Trying analyze {class_decl.name} {method_decl.name} ' - # f'{iteration_number}/{total_number}') - opport = _print_extraction_opportunities( - ast.get_subtree(method_decl) - ) - if opport: - best_group = opport[0] - lines = [node.line for node in best_group._optimal_opportunity] - start_line_opportunity = min(lines) - end_line_opportunity = max(lines) - lines_intersected = set( - range(end_line_of_invocation_occurred, end_line_of_invocation_occurred)) \ - & set(lines) - - if (start_line_of_invocation_occurred == start_line_opportunity) \ - and (end_line_of_invocation_occurred == end_line_opportunity): - matched_cases += 1 - matched_percent += float(len(lines_intersected)) / len(lines) - else: - no_opportunity_chosen += 0 - # print(class_decl.name, method_decl.name) - - except Exception as e: - import traceback - print(src_filename) - traceback.print_exc() - failed_cases_in_SEMI_algorithm += 1 - - break - 
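
Editorial note on the `Stats` named tuple used in this revision: named tuples are immutable, and `_replace` returns a new instance rather than mutating in place, so the counters updated via bare `stats._replace(...)` calls here are silently discarded (the later dataclass rewrite in this series avoids the problem). A minimal demonstration:

```python
from collections import namedtuple

Stats = namedtuple('Stats', ['matched_cases', 'no_opportunity_chosen'])

stats = Stats(0, 0)
stats._replace(matched_cases=1)          # result discarded: stats is unchanged
print(stats.matched_cases)               # 0
stats = stats._replace(matched_cases=1)  # reassignment is required
print(stats.matched_cases)               # 1
```
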
break - - except Exception as e: - failed_cases_in_validation_examples += 1 + with ProcessPool(1) as executor: + rows_list = list(df_is_parsed.iterrows()) + validate_row_f = partial(validate_row, dataset_dir) + future = executor.map(validate_row_f, rows_list, timeout=1000, ) + result = future.result() + for index, row in tqdm(rows_list): + try: + should_include_in_results, stats = next(result) + if should_include_in_results: + matched_cases += stats.matched_cases + failed_cases_in_SEMI_algorithm += stats.failed_cases_in_SEMI_algorithm + failed_cases_in_validation_examples += stats.failed_cases_in_validation_examples + no_opportunity_chosen += stats.no_opportunity_chosen + matched_percent += stats.matched_percent + except Exception: + continue print(f'Failed SEMI algorithm errors: {failed_cases_in_SEMI_algorithm}') print(f'Failed examples of synth dataset: {failed_cases_in_validation_examples}') From 3af0cbaa5406b7962915146043bf50d9ec111f42 Mon Sep 17 00:00:00 2001 From: Evgeny Maslov Date: Thu, 29 Oct 2020 18:54:05 +0300 Subject: [PATCH 09/17] 1 wroking version --- veniq/dataset_collection/validation.py | 204 +++++++++++++------------ 1 file changed, 108 insertions(+), 96 deletions(-) diff --git a/veniq/dataset_collection/validation.py b/veniq/dataset_collection/validation.py index 05f2f83c..412e136b 100644 --- a/veniq/dataset_collection/validation.py +++ b/veniq/dataset_collection/validation.py @@ -3,12 +3,14 @@ from collections import namedtuple from functools import partial from pathlib import Path -from typing import Tuple - +from typing import Tuple, List +import traceback import pandas as pd +from numpy import mean from pebble import ProcessPool from tqdm import tqdm +from metrics.ncss.ncss import NCSSMetric from veniq.baselines.semi.create_extraction_opportunities import create_extraction_opportunities from veniq.baselines.semi.extract_semantic import extract_method_statements_semantic from veniq.baselines.semi.filter_extraction_opportunities import filter_extraction_opportunities @@ -18,8 +20,10 @@ from veniq.ast_framework import AST, ASTNodeType from random import choice +from dataclasses import make_dataclass as md, dataclass, asdict + -def _print_extraction_opportunities( +def find_extraction_opportunities( method_ast: AST): statements_semantic = extract_method_statements_semantic(method_ast) extraction_opportunities = create_extraction_opportunities(statements_semantic) @@ -30,30 +34,30 @@ def _print_extraction_opportunities( statements_semantic, filtered_extraction_opportunities ) - # print( - # f"Extraction opportunities groups of method {method_name} in class {class_name} in file {filepath}:" - # ) - - # for extraction_opportunity_group in extraction_opportunities_groups: - # print(f"\tExtraction opportunities group with scope {extraction_opportunity_group.benifit}:") - # for extraction_opportunity, benifit in extraction_opportunity_group.opportunities: - # print(f"\t\tExtraction opportunity with score {benifit}:") - # for statement in extraction_opportunity: - # print(f"\t\t\t{statement.node_type} on line {statement.line}") return extraction_opportunities_groups -Stats = namedtuple( - 'Stats', - ['matched_cases', - 'failed_cases_in_SEMI_algorithm', - 'no_opportunity_chosen', - 'matched_percent', - 'failed_cases_in_validation_examples' - ]) - - -def validate_row(dataset_dir: Path, row: pd.Series) -> Tuple[bool, Stats]: +@dataclass +class MatchedResult: + output_filename: str + input_filename: str + start_line_SEMI: int + end_line_SEMI: int + start_line_dataset: int + 
end_line_dataset: int + percent_matched: float + class_name: str + method_name: str + error_string: str + ncss: int + matched: bool + failed_cases_in_SEMI_algorithm: bool + no_opportunity_chosen: bool + failed_cases_in_validation_examples: bool + + +def validate_row(dataset_dir: Path, row: pd.Series) \ + -> List[MatchedResult]: """ Validate row of dataset @@ -63,74 +67,86 @@ def validate_row(dataset_dir: Path, row: pd.Series) -> Tuple[bool, Stats]: :return: boolean value whether we should consider this row or skip it, Stats - return collected stats """ - stats = Stats(0, 0, 0, 0, 0) + results = [] try: - start_line_of_invocation_occurred = row[1]['start_line_of_function_where_invocation_occurred'] - start_line_of_invoked_function = row[1]['invocation_method_start_line'] - end_line_of_invoked_function = row[1]['invocation_method_end_line'] - end_line_of_invocation_occurred = \ - start_line_of_invocation_occurred + end_line_of_invoked_function - start_line_of_invoked_function + start_line_of_inserted_block = int(row[1]['inline_insertion_line_start']) + end_line_of_inserted_block = int(row[1]['inline_insertion_line_end']) src_filename = row[1]['output_filename'] class_name = row[1]['class_name'] - - ast = AST.build_from_javalang(build_ast(dataset_dir / src_filename)) + full_path = dataset_dir / src_filename + ast = AST.build_from_javalang(build_ast(full_path)) function_to_analyze = row[1]['method_where_invocation_occurred'] + for class_decl in ast.get_proxy_nodes(ASTNodeType.CLASS_DECLARATION): # class_ast = ast.get_subtree(class_decl) if class_decl.name != class_name: continue elif class_decl.name == class_name: objects_to_consider = list(class_decl.methods) + list(class_decl.constructors) or [] - for method_decl in objects_to_consider: - if method_decl.name != function_to_analyze: + for ast_node in objects_to_consider: + result = MatchedResult( + output_filename=full_path, + input_filename=row[1]['input_filename'], + class_name='', + method_name='', + start_line_SEMI=-1, + end_line_SEMI=-1, + start_line_dataset=start_line_of_inserted_block, + end_line_dataset=end_line_of_inserted_block, + percent_matched=-1.0, + error_string='', + ncss=0, + matched=False, + failed_cases_in_SEMI_algorithm=False, + no_opportunity_chosen=False, + failed_cases_in_validation_examples=False, + ) + if ast_node.name != function_to_analyze: continue try: - opport = _print_extraction_opportunities( - ast.get_subtree(method_decl) - ) + ast_subtree = ast.get_subtree(ast_node) + opport = find_extraction_opportunities(ast_subtree) if opport: best_group = opport[0] lines = [node.line for node in best_group._optimal_opportunity] start_line_opportunity = min(lines) end_line_opportunity = max(lines) - lines_intersected = set( - range(start_line_of_invocation_occurred, end_line_of_invocation_occurred)) \ - & set(lines) - input_f = row[1]['output_filename'] - print( - f'{input_f} {class_decl.name} {method_decl.name}: ' - f'inserted lines: {start_line_of_invocation_occurred}, {end_line_of_invocation_occurred};' - f'opportunity chosen: {start_line_opportunity}, {end_line_opportunity}') - - if (start_line_of_invocation_occurred == start_line_opportunity) \ - and (end_line_of_invocation_occurred == end_line_opportunity): - stats._replace(matched_percent=stats.matched_cases + 1) - updated_percent_value = stats.matched_percent + float(len(lines_intersected)) / len(lines) - stats._replace(matched_percent=updated_percent_value) + dataset_range_extraction = range(start_line_of_inserted_block, end_line_of_inserted_block) + 
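
Editorial aside: this revision accumulates one `MatchedResult` per analysed method and flattens each into the output DataFrame with `dataclasses.asdict`. The sketch below shows that dataclass-to-DataFrame flow with a trimmed, illustrative field set; note that `DataFrame.append` (used in the patch) is deprecated and removed in pandas 2.0, so building the frame from a list of dicts is a safer equivalent.

```python
from dataclasses import dataclass, asdict

import pandas as pd


@dataclass
class MatchedResultExample:
    # Trimmed field set, for illustration only.
    output_filename: str
    class_name: str
    method_name: str
    matched: bool
    percent_matched: float


rows = [
    MatchedResultExample('A_setValue_10.java', 'A', 'setValue', True, 1.0),
    MatchedResultExample('B_reset_20.java', 'B', 'reset', False, 0.4),
]
# Building from a list of dicts avoids the deprecated DataFrame.append.
output_df = pd.DataFrame([asdict(r) for r in rows])
print(output_df[['class_name', 'matched', 'percent_matched']])
```
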
lines_intersected = set(dataset_range_extraction) & set(lines) + result.class_name = class_decl.name + result.method_name = ast_node.name + result.start_line_SEMI = start_line_opportunity + result.end_line_SEMI = end_line_opportunity + result.ncss = NCSSMetric().value(ast_subtree) + + if (start_line_of_inserted_block == start_line_opportunity) \ + and (end_line_of_inserted_block == end_line_opportunity): + result.matched = True + + result.percent_matched = float(len(lines_intersected)) / len(dataset_range_extraction) else: - stats._replace(no_opportunity_chosen=1) + result.no_opportunity_chosen = True # print(class_decl.name, method_decl.name) except Exception as e: - import traceback - print(src_filename) - # traceback.print_exc() - stats._replace(failed_cases_in_SEMI_algorithm=stats.failed_cases_in_SEMI_algorithm + 1) + traceback.print_exc() + result.error_string = str(e) + result.failed_cases_in_SEMI_algorithm = True + finally: + results.append(result) break break except Exception as e: - import traceback - # traceback.print_exc() - stats._replace(failed_cases_in_validation_examples=stats.failed_cases_in_validation_examples + 1) + traceback.print_exc() + result.error_string = str(e) + result.failed_cases_in_validation_examples = True + results.append(result) - # smth bad happened - if stats.failed_cases_in_validation_examples: - return False, stats - else: - return True, stats + # print(dataset_dir / src_filename) + return results if __name__ == '__main__': @@ -158,44 +174,40 @@ def validate_row(dataset_dir: Path, row: pd.Series) -> Tuple[bool, Stats]: dataset_dir = Path(args.dataset_dir) csv_dataset_filename = Path(args.csv_input) df = pd.read_csv(csv_dataset_filename) - df_is_parsed = df[df['can_be_parsed']] - failed_cases_in_SEMI_algorithm = 0 - failed_cases_in_validation_examples = 0 - matched_cases = 0 - no_opportunity_chosen = 0 - total_number = df.shape[0] - matched_percent = 0 - # f = r'D:\temp\dataset_colelction_refactoring\small_dataset\output_files\SecurityConstraintPanel_setValue_192.java' - # ast = AST.build_from_javalang(build_ast(f)) - # class_t = [x for x in ast.get_proxy_nodes(ASTNodeType.CLASS_DECLARATION) if x.name == 'SecurityConstraintPanel'][0] - # method_decl = [x for x in ast.get_proxy_nodes(ASTNodeType.METHOD_DECLARATION) if x.name == 'refillUserDataConstraint'][0] - # body_start_line, body_end_line = method_body_lines(method_decl, f) - # print(body_start_line, body_end_line) - iteration_number = 0 - - with ProcessPool(1) as executor: - rows_list = list(df_is_parsed.iterrows()) + df = df[df['can_be_parsed']] + + output_df = pd.DataFrame(columns=list(MatchedResult.__annotations__.keys())) + + with ProcessPool(system_cores_qty) as executor: validate_row_f = partial(validate_row, dataset_dir) - future = executor.map(validate_row_f, rows_list, timeout=1000, ) + future = executor.map(validate_row_f, df.iterrows(), timeout=10000, ) result = future.result() - for index, row in tqdm(rows_list): + for index, row in tqdm(df.iterrows()): try: - should_include_in_results, stats = next(result) - if should_include_in_results: - matched_cases += stats.matched_cases - failed_cases_in_SEMI_algorithm += stats.failed_cases_in_SEMI_algorithm - failed_cases_in_validation_examples += stats.failed_cases_in_validation_examples - no_opportunity_chosen += stats.no_opportunity_chosen - matched_percent += stats.matched_percent - except Exception: + # print(row['input_filename']) + results: List[MatchedResult] = next(result) + for res in results: + output_df = output_df.append(asdict(res), 
ignore_index=True) + output_df.to_csv('matched.csv') + except Exception as e: + # print(f"Exception inside thread happened: {str(e)}") + print(traceback.format_exc()) continue + matched_cases = float(output_df[output_df["matched"]].shape[0]) + failed_cases_in_SEMI_algorithm = output_df[output_df["failed_cases_in_SEMI_algorithm"]].shape[0] + failed_cases_in_validation_examples = output_df[output_df["failed_cases_in_validation_examples"]].shape[0] + no_opportunity_chosen = output_df[output_df["no_opportunity_chosen"]].shape[0] + matched_percent = mean(output_df[output_df["percent_matched"] > 0].percent_matched.values) print(f'Failed SEMI algorithm errors: {failed_cases_in_SEMI_algorithm}') print(f'Failed examples of synth dataset: {failed_cases_in_validation_examples}') print(f'matched_cases: {matched_cases}') print(f'No opportunity chosen: {no_opportunity_chosen} times') - print(f'Total number of cases: {total_number}') - print(f'Total number of matched lines: {matched_percent}') - matched = (matched_cases + no_opportunity_chosen) - total_number = total_number - failed_cases_in_SEMI_algorithm - failed_cases_in_validation_examples - print(float(matched) / total_number) + print(f'Total number of handled cases: {output_df.shape[0]}') + print(f'Average of matched lines: {matched_percent}') + total_case_handled = output_df.shape[0] - \ + failed_cases_in_SEMI_algorithm - \ + failed_cases_in_validation_examples + if total_case_handled > 0: + result = matched_cases / total_case_handled + print(f'Matched {result}% of cases, {matched_cases} out of {total_case_handled}') From 00c914f078b7920300d0ac5ca05b376360ff5990 Mon Sep 17 00:00:00 2001 From: Evgeny Maslov Date: Fri, 30 Oct 2020 13:49:21 +0300 Subject: [PATCH 10/17] Add tests and fixed_line_function --- test/validation/DynaMenuModel.java | 111 ++++++++++++ test/validation/NameNodeRpcServer.java | 239 +++++++++++++++++++++++++ test/validation/User.java | 24 +++ test/validation/__init__.py | 0 test/validation/test_validation.py | 39 ++++ veniq/dataset_collection/validation.py | 77 +++++--- 6 files changed, 468 insertions(+), 22 deletions(-) create mode 100644 test/validation/DynaMenuModel.java create mode 100644 test/validation/NameNodeRpcServer.java create mode 100644 test/validation/User.java create mode 100644 test/validation/__init__.py create mode 100644 test/validation/test_validation.py diff --git a/test/validation/DynaMenuModel.java b/test/validation/DynaMenuModel.java new file mode 100644 index 00000000..39ff1f13 --- /dev/null +++ b/test/validation/DynaMenuModel.java @@ -0,0 +1,111 @@ +package org.openide.awt; +import java.awt.Component; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import javax.swing.Action; +import javax.swing.Icon; +import javax.swing.ImageIcon; +import javax.swing.JComponent; +import javax.swing.JMenu; +import javax.swing.JMenuItem; +import javax.swing.JPopupMenu; +import javax.swing.JSeparator; +import javax.swing.UIManager; +import org.openide.filesystems.FileObject; +import org.openide.util.ImageUtilities; +import org.openide.util.Utilities; +import org.openide.util.actions.Presenter; +class DynaMenuModel { + private static final Icon BLANK_ICON = new ImageIcon(ImageUtilities.loadImage("org/openide/loaders/empty.gif")); + private List menuItems; + private HashMap actionToMenuMap; + private boolean isWithIcons = false; + public DynaMenuModel() { + actionToMenuMap = new 
HashMap(); + } + public void loadSubmenu(List cInstances, JMenu m, boolean remove, Map cookiesToFiles) { + boolean addSeparator = false; + Icon curIcon = null; + Iterator it = cInstances.iterator(); + menuItems = new ArrayList(cInstances.size()); + actionToMenuMap.clear(); + while (it.hasNext()) { + Object obj = it.next(); + if (obj instanceof Action) { + FileObject file = cookiesToFiles.get(obj); + if (file != null) { + AcceleratorBinding.setAccelerator((Action) obj, file); + } + } + if (obj instanceof Presenter.Menu) { + obj = ((Presenter.Menu)obj).getMenuPresenter(); + } + if (obj instanceof DynamicMenuContent) { + if(addSeparator) { + menuItems.add(null); + addSeparator = false; + } + DynamicMenuContent mn = (DynamicMenuContent)obj; + JComponent[] itms = convertArray(mn.getMenuPresenters()); + actionToMenuMap.put(mn, itms); + Iterator itx = Arrays.asList(itms).iterator(); + while (itx.hasNext()) { + JComponent comp = (JComponent)itx.next(); + menuItems.add(comp); + isWithIcons = checkIcon(comp, isWithIcons); + } + continue; + } + if (obj instanceof JMenuItem) { + if(addSeparator) { + menuItems.add(null); + addSeparator = false; + } + isWithIcons = checkIcon(obj, isWithIcons); + menuItems.add((JMenuItem)obj); + } else if (obj instanceof JSeparator) { + addSeparator = menuItems.size() > 0; + } else if (obj instanceof Action) { + if(addSeparator) { + menuItems.add(null); + addSeparator = false; + } + Action a = (Action)obj; + Actions.MenuItem item = new Actions.MenuItem(a, true); + isWithIcons = checkIcon(item, isWithIcons); + actionToMenuMap.put(item, new JComponent[] {item}); + menuItems.add(item); + } + } + if (isWithIcons) { + menuItems = alignVertically(menuItems); + } + if (remove) { + m.removeAll(); + } + JComponent curItem = null; + boolean wasSeparator = false; + for (Iterator iter = menuItems.iterator(); iter.hasNext(); ) { + curItem = iter.next(); + if (curItem == null) { + JMenu menu = new JMenu(); + menu.addSeparator(); + curItem = (JSeparator)menu.getPopupMenu().getComponent(0); + } + m.add(curItem); + boolean isSeparator = curItem instanceof JSeparator; + if (isSeparator && wasSeparator) { + curItem.setVisible(false); + } + if (!(curItem instanceof InvisibleMenuItem)) { + wasSeparator = isSeparator; + } + } + } + +} \ No newline at end of file diff --git a/test/validation/NameNodeRpcServer.java b/test/validation/NameNodeRpcServer.java new file mode 100644 index 00000000..41821188 --- /dev/null +++ b/test/validation/NameNodeRpcServer.java @@ -0,0 +1,239 @@ +package org.apache.hadoop.hdfs.server.namenode; +import static org.apache.hadoop.fs.CommonConfigurationKeys.IPC_MAXIMUM_DATA_LENGTH; +import static org.apache.hadoop.fs.CommonConfigurationKeys.IPC_MAXIMUM_DATA_LENGTH_DEFAULT; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_HANDLER_COUNT_DEFAULT; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_HANDLER_COUNT_KEY; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LIFELINE_HANDLER_COUNT_KEY; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LIFELINE_HANDLER_RATIO_DEFAULT; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LIFELINE_HANDLER_RATIO_KEY; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SERVICE_HANDLER_COUNT_DEFAULT; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SERVICE_HANDLER_COUNT_KEY; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_AUXILIARY_KEY; +import static 
org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_STATE_CONTEXT_ENABLED_DEFAULT; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_STATE_CONTEXT_ENABLED_KEY; +import static org.apache.hadoop.hdfs.server.common.HdfsServerConstants.MAX_PATH_DEPTH; +import static org.apache.hadoop.hdfs.server.common.HdfsServerConstants.MAX_PATH_LENGTH; +import static org.apache.hadoop.util.Time.now; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.net.InetSocketAddress; +import java.security.PrivilegedExceptionAction; +import java.util.Arrays; +import java.util.Collection; +import java.util.EnumSet; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import com.google.common.collect.Lists; +import org.apache.hadoop.HadoopIllegalArgumentException; +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.ReconfigurationTaskStatus; +import org.apache.hadoop.crypto.CryptoProtocolVersion; +import org.apache.hadoop.fs.BatchedRemoteIterator.BatchedEntries; +import org.apache.hadoop.hdfs.AddBlockFlag; +import org.apache.hadoop.fs.CacheFlag; +import org.apache.hadoop.fs.CommonConfigurationKeys; +import org.apache.hadoop.fs.ContentSummary; +import org.apache.hadoop.fs.CreateFlag; +import org.apache.hadoop.fs.FileAlreadyExistsException; +import org.apache.hadoop.fs.FsServerDefaults; +import org.apache.hadoop.fs.InvalidPathException; +import org.apache.hadoop.fs.Options; +import org.apache.hadoop.fs.ParentNotDirectoryException; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.StorageType; +import org.apache.hadoop.fs.UnresolvedLinkException; +import org.apache.hadoop.fs.XAttr; +import org.apache.hadoop.fs.XAttrSetFlag; +import org.apache.hadoop.fs.permission.AclEntry; +import org.apache.hadoop.fs.permission.AclStatus; +import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.fs.permission.FsAction; +import org.apache.hadoop.fs.permission.PermissionStatus; +import org.apache.hadoop.fs.QuotaUsage; +import org.apache.hadoop.ha.HAServiceStatus; +import org.apache.hadoop.ha.HealthCheckFailedException; +import org.apache.hadoop.ha.ServiceFailedException; +import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.HAServiceProtocolService; +import org.apache.hadoop.ha.protocolPB.HAServiceProtocolPB; +import org.apache.hadoop.ha.protocolPB.HAServiceProtocolServerSideTranslatorPB; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.DFSUtil; +import org.apache.hadoop.hdfs.DFSUtilClient; +import org.apache.hadoop.hdfs.HDFSPolicyProvider; +import org.apache.hadoop.hdfs.inotify.EventBatch; +import org.apache.hadoop.hdfs.inotify.EventBatchList; +import org.apache.hadoop.hdfs.protocol.AclException; +import org.apache.hadoop.hdfs.protocol.AddErasureCodingPolicyResponse; +import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException; +import org.apache.hadoop.hdfs.protocol.BatchedDirectoryListing; +import org.apache.hadoop.hdfs.protocol.BlockListAsLongs; +import org.apache.hadoop.hdfs.protocol.BlockStoragePolicy; +import org.apache.hadoop.hdfs.protocol.CacheDirectiveEntry; +import org.apache.hadoop.hdfs.protocol.CacheDirectiveInfo; +import org.apache.hadoop.hdfs.protocol.CachePoolEntry; +import org.apache.hadoop.hdfs.protocol.CachePoolInfo; +import org.apache.hadoop.hdfs.protocol.CorruptFileBlocks; +import org.apache.hadoop.hdfs.protocol.DSQuotaExceededException; +import 
org.apache.hadoop.hdfs.protocol.DatanodeID; +import org.apache.hadoop.hdfs.protocol.DatanodeInfo; +import org.apache.hadoop.hdfs.protocol.DirectoryListing; +import org.apache.hadoop.hdfs.protocol.ECBlockGroupStats; +import org.apache.hadoop.hdfs.protocol.ECTopologyVerifierResult; +import org.apache.hadoop.hdfs.protocol.ErasureCodingPolicy; +import org.apache.hadoop.hdfs.protocol.EncryptionZone; +import org.apache.hadoop.hdfs.protocol.ErasureCodingPolicyInfo; +import org.apache.hadoop.hdfs.protocol.ExtendedBlock; +import org.apache.hadoop.hdfs.protocol.FSLimitException; +import org.apache.hadoop.hdfs.protocol.HdfsLocatedFileStatus; +import org.apache.hadoop.hdfs.protocol.HdfsPartialListing; +import org.apache.hadoop.hdfs.protocol.LastBlockWithStatus; +import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType; +import org.apache.hadoop.hdfs.protocol.HdfsConstants.ReencryptAction; +import org.apache.hadoop.hdfs.protocol.HdfsConstants.RollingUpgradeAction; +import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction; +import org.apache.hadoop.hdfs.protocol.HdfsConstants.StoragePolicySatisfierMode; +import org.apache.hadoop.hdfs.protocol.HdfsFileStatus; +import org.apache.hadoop.hdfs.protocol.LocatedBlock; +import org.apache.hadoop.hdfs.protocol.LocatedBlocks; +import org.apache.hadoop.hdfs.protocol.NSQuotaExceededException; +import org.apache.hadoop.hdfs.protocol.OpenFileEntry; +import org.apache.hadoop.hdfs.protocol.OpenFilesIterator; +import org.apache.hadoop.hdfs.protocol.OpenFilesIterator.OpenFilesType; +import org.apache.hadoop.hdfs.protocol.QuotaByStorageTypeExceededException; +import org.apache.hadoop.hdfs.protocol.QuotaExceededException; +import org.apache.hadoop.hdfs.protocol.RecoveryInProgressException; +import org.apache.hadoop.hdfs.protocol.ReplicatedBlockStats; +import org.apache.hadoop.hdfs.protocol.ZoneReencryptionStatus; +import org.apache.hadoop.hdfs.protocol.RollingUpgradeInfo; +import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport; +import org.apache.hadoop.hdfs.protocol.SnapshotDiffReportListing; +import org.apache.hadoop.hdfs.protocol.SnapshottableDirectoryStatus; +import org.apache.hadoop.hdfs.protocol.UnregisteredNodeException; +import org.apache.hadoop.hdfs.protocol.UnresolvedPathException; +import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos.ClientNamenodeProtocol; +import org.apache.hadoop.hdfs.protocol.proto.DatanodeLifelineProtocolProtos.DatanodeLifelineProtocolService; +import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.DatanodeProtocolService; +import org.apache.hadoop.hdfs.protocol.proto.NamenodeProtocolProtos.NamenodeProtocolService; +import org.apache.hadoop.hdfs.protocol.proto.ReconfigurationProtocolProtos.ReconfigurationProtocolService; +import org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolPB; +import org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB; +import org.apache.hadoop.hdfs.protocolPB.DatanodeLifelineProtocolPB; +import org.apache.hadoop.hdfs.protocolPB.DatanodeLifelineProtocolServerSideTranslatorPB; +import org.apache.hadoop.hdfs.protocolPB.DatanodeProtocolPB; +import org.apache.hadoop.hdfs.protocolPB.DatanodeProtocolServerSideTranslatorPB; +import org.apache.hadoop.hdfs.protocolPB.NamenodeProtocolPB; +import org.apache.hadoop.hdfs.protocolPB.NamenodeProtocolServerSideTranslatorPB; +import org.apache.hadoop.hdfs.protocolPB.ReconfigurationProtocolPB; +import org.apache.hadoop.hdfs.protocolPB.ReconfigurationProtocolServerSideTranslatorPB; +import 
org.apache.hadoop.hdfs.security.token.block.DataEncryptionKey; +import org.apache.hadoop.hdfs.security.token.block.ExportedBlockKeys; +import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier; +import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager; +import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerFaultInjector; +import org.apache.hadoop.hdfs.server.common.HdfsServerConstants; +import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole; +import org.apache.hadoop.hdfs.server.common.HttpGetFailedException; +import org.apache.hadoop.hdfs.server.common.IncorrectVersionException; +import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory; +import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics; +import org.apache.hadoop.hdfs.server.namenode.sps.StoragePolicySatisfyManager; +import org.apache.hadoop.hdfs.server.protocol.BlockReportContext; +import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations; +import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand; +import org.apache.hadoop.hdfs.server.protocol.DatanodeProtocol; +import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration; +import org.apache.hadoop.hdfs.server.protocol.DatanodeStorageReport; +import org.apache.hadoop.hdfs.server.protocol.FinalizeCommand; +import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse; +import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand; +import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols; +import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration; +import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo; +import org.apache.hadoop.hdfs.server.protocol.NodeRegistration; +import org.apache.hadoop.hdfs.server.protocol.RegisterCommand; +import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest; +import org.apache.hadoop.hdfs.server.protocol.SlowDiskReports; +import org.apache.hadoop.hdfs.server.protocol.SlowPeerReports; +import org.apache.hadoop.hdfs.server.protocol.StorageBlockReport; +import org.apache.hadoop.hdfs.server.protocol.StorageReceivedDeletedBlocks; +import org.apache.hadoop.hdfs.server.protocol.StorageReport; +import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary; +import org.apache.hadoop.io.EnumSetWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.ipc.ProtobufRpcEngine; +import org.apache.hadoop.ipc.RPC; +import org.apache.hadoop.ipc.RetriableException; +import org.apache.hadoop.ipc.RetryCache; +import org.apache.hadoop.ipc.RetryCache.CacheEntry; +import org.apache.hadoop.ipc.RetryCache.CacheEntryWithPayload; +import org.apache.hadoop.ipc.Server; +import org.apache.hadoop.ipc.StandbyException; +import org.apache.hadoop.ipc.RefreshRegistry; +import org.apache.hadoop.ipc.RefreshResponse; +import org.apache.hadoop.net.Node; +import org.apache.hadoop.security.AccessControlException; +import org.apache.hadoop.security.Groups; +import org.apache.hadoop.security.SecurityUtil; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.security.authorize.AuthorizationException; +import org.apache.hadoop.security.authorize.ProxyUsers; +import org.apache.hadoop.security.proto.RefreshAuthorizationPolicyProtocolProtos.RefreshAuthorizationPolicyProtocolService; +import org.apache.hadoop.security.proto.RefreshUserMappingsProtocolProtos.RefreshUserMappingsProtocolService; +import org.apache.hadoop.security.protocolPB.RefreshAuthorizationPolicyProtocolPB; +import 
org.apache.hadoop.security.protocolPB.RefreshAuthorizationPolicyProtocolServerSideTranslatorPB; +import org.apache.hadoop.security.protocolPB.RefreshUserMappingsProtocolPB; +import org.apache.hadoop.security.protocolPB.RefreshUserMappingsProtocolServerSideTranslatorPB; +import org.apache.hadoop.ipc.protocolPB.RefreshCallQueueProtocolPB; +import org.apache.hadoop.ipc.protocolPB.RefreshCallQueueProtocolServerSideTranslatorPB; +import org.apache.hadoop.ipc.proto.RefreshCallQueueProtocolProtos.RefreshCallQueueProtocolService; +import org.apache.hadoop.ipc.protocolPB.GenericRefreshProtocolPB; +import org.apache.hadoop.ipc.protocolPB.GenericRefreshProtocolServerSideTranslatorPB; +import org.apache.hadoop.ipc.proto.GenericRefreshProtocolProtos.GenericRefreshProtocolService; +import org.apache.hadoop.security.token.SecretManager.InvalidToken; +import org.apache.hadoop.security.token.Token; +import org.apache.hadoop.tools.proto.GetUserMappingsProtocolProtos.GetUserMappingsProtocolService; +import org.apache.hadoop.tools.protocolPB.GetUserMappingsProtocolPB; +import org.apache.hadoop.tools.protocolPB.GetUserMappingsProtocolServerSideTranslatorPB; +import org.apache.hadoop.tracing.SpanReceiverInfo; +import org.apache.hadoop.tracing.TraceAdminPB.TraceAdminService; +import org.apache.hadoop.tracing.TraceAdminProtocolPB; +import org.apache.hadoop.tracing.TraceAdminProtocolServerSideTranslatorPB; +import org.apache.hadoop.util.VersionInfo; +import org.apache.hadoop.util.VersionUtil; +import org.slf4j.Logger; +import com.google.common.annotations.VisibleForTesting; +import com.google.protobuf.BlockingService; +import javax.annotation.Nonnull; +@InterfaceAudience.Private +@VisibleForTesting +public class NameNodeRpcServer implements NamenodeProtocols { + private static final Logger LOG = NameNode.LOG; + private static final Logger stateChangeLog = NameNode.stateChangeLog; + private static final Logger blockStateChangeLog = NameNode + .blockStateChangeLog; + protected final FSNamesystem namesystem; + protected final NameNode nn; + private final NameNodeMetrics metrics; + private final RetryCache retryCache; + private final boolean serviceAuthEnabled; + private final RPC.Server serviceRpcServer; + private final InetSocketAddress serviceRPCAddress; + private final RPC.Server lifelineRpcServer; + private final InetSocketAddress lifelineRPCAddress; + protected final RPC.Server clientRpcServer; + protected final InetSocketAddress clientRpcAddress; + private final String minimumDataNodeVersion; + private final String defaultECPolicyName; + + public ECBlockGroupStats getECBlockGroupStats() throws IOException { + if (!this.nn.isStarted()) { + String message = NameNode.composeNotStartedMessage(this.nn.getRole()); + throw new RetriableException(message); + } + namesystem.checkOperation(OperationCategory.READ); + return namesystem.getECBlockGroupStats(); + } +} \ No newline at end of file diff --git a/test/validation/User.java b/test/validation/User.java new file mode 100644 index 00000000..47bbe34b --- /dev/null +++ b/test/validation/User.java @@ -0,0 +1,24 @@ +package com.baeldung.constructorsstaticfactorymethods.entities; +import java.time.LocalTime; +import java.util.logging.ConsoleHandler; +import java.util.logging.Level; +import java.util.logging.Logger; +import java.util.logging.SimpleFormatter; +public class User { + private static volatile User instance = null; + private static final Logger LOGGER = Logger.getLogger(User.class.getName()); + private final String name; + private final String email; + private 
final String country; + public static User createWithDefaultCountry(String name, String email) { + return new User(name, email, "Argentina"); + } + public static User createWithLoggedInstantiationTime(String name, String email, String country) { + ConsoleHandler handler = new ConsoleHandler(); + handler.setLevel(Level.INFO); + handler.setFormatter(new SimpleFormatter()); + LOGGER.addHandler(handler); + LOGGER.log(Level.INFO, "Creating User instance at : {0}", LocalTime.now()); + return new User(name, email, country); + } +} \ No newline at end of file diff --git a/test/validation/__init__.py b/test/validation/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/test/validation/test_validation.py b/test/validation/test_validation.py new file mode 100644 index 00000000..a6002e6e --- /dev/null +++ b/test/validation/test_validation.py @@ -0,0 +1,39 @@ +from pathlib import Path +from unittest import TestCase + +from dataset_collection.validation import find_extraction_opportunities, fix_start_end_lines_for_opportunity +from veniq.ast_framework import AST, ASTNodeType +from veniq.utils.ast_builder import build_ast + + +class TestValidation(TestCase): + folder = Path(__file__).absolute().parent + + def test_validation_semi_2_closing_brackets(self): + file = self.folder / "DynaMenuModel.java" + lines_extracted_by_semi = list(range(91, 107)) + fixed_lines = fix_start_end_lines_for_opportunity( + lines_extracted_by_semi, + str(file) + ) + self.assertEqual(range(91, 109), fixed_lines) + + def test_semi_no_need_to_find_closing_brackets(self): + file = self.folder / "User.java" + lines_extracted_by_semi = list(range(17, 22)) + fixed_lines = fix_start_end_lines_for_opportunity( + lines_extracted_by_semi, + str(file) + ) + self.assertEqual(range(17, 22), fixed_lines) + + def test_validation_semi_1_closing_brackets(self): + file = self.folder / "NameNodeRpcServer.java" + lines_extracted_by_semi = list(range(231, 234)) + fixed_lines = fix_start_end_lines_for_opportunity( + lines_extracted_by_semi, + str(file) + ) + self.assertEqual(range(231, 235), fixed_lines) + + # def test_1_open_bracket(self): diff --git a/veniq/dataset_collection/validation.py b/veniq/dataset_collection/validation.py index 412e136b..abbeccbb 100644 --- a/veniq/dataset_collection/validation.py +++ b/veniq/dataset_collection/validation.py @@ -1,26 +1,25 @@ import os +import traceback from argparse import ArgumentParser -from collections import namedtuple +from dataclasses import dataclass, asdict from functools import partial from pathlib import Path -from typing import Tuple, List -import traceback +from typing import List, Tuple + import pandas as pd from numpy import mean from pebble import ProcessPool from tqdm import tqdm -from metrics.ncss.ncss import NCSSMetric +from veniq.baselines.semi._common_types import ExtractionOpportunity +from veniq.metrics.ncss.ncss import NCSSMetric +from veniq.utils.encoding_detector import read_text_with_autodetected_encoding +from veniq.ast_framework import AST, ASTNodeType from veniq.baselines.semi.create_extraction_opportunities import create_extraction_opportunities from veniq.baselines.semi.extract_semantic import extract_method_statements_semantic from veniq.baselines.semi.filter_extraction_opportunities import filter_extraction_opportunities -from veniq.baselines.semi.rank_extraction_opportunities import rank_extraction_opportunities -from veniq.dataset_collection.augmentation import method_body_lines +from veniq.baselines.semi.rank_extraction_opportunities import 
rank_extraction_opportunities, ExtractionOpportunityGroup from veniq.utils.ast_builder import build_ast -from veniq.ast_framework import AST, ASTNodeType -from random import choice - -from dataclasses import make_dataclass as md, dataclass, asdict def find_extraction_opportunities( @@ -38,7 +37,7 @@ def find_extraction_opportunities( @dataclass -class MatchedResult: +class RowResult: output_filename: str input_filename: str start_line_SEMI: int @@ -56,8 +55,43 @@ class MatchedResult: failed_cases_in_validation_examples: bool +def fix_start_end_lines_for_opportunity( + extracted_lines_of_opportunity: List[int], + filepath: str) -> range: + """ + Finds start and end lines for opportunity + + :param filepath: filename where opportunity was found + :param extracted_lines_of_opportunity: list of lines for opportunity + :return: list of extracted lines for opportunity + """ + start_line_opportunity = min(extracted_lines_of_opportunity) + end_line_opportunity = max(extracted_lines_of_opportunity) + text = read_text_with_autodetected_encoding(filepath).split('\n') + extraction = text[start_line_opportunity:end_line_opportunity] + open_brackets = 0 + close_brackets = 0 + for x in extraction: + close_brackets += x.count('}') + for x in extraction: + open_brackets += x.count('{') + + if open_brackets < close_brackets: + diff = close_brackets - open_brackets + while diff > 0: + start_line_opportunity -= 1 + diff -= 1 + elif open_brackets > close_brackets: + diff = open_brackets - close_brackets + while diff > 0: + end_line_opportunity += 1 + diff -= 1 + + return range(start_line_opportunity, end_line_opportunity + 1) + + def validate_row(dataset_dir: Path, row: pd.Series) \ - -> List[MatchedResult]: + -> List[RowResult]: """ Validate row of dataset @@ -79,13 +113,12 @@ def validate_row(dataset_dir: Path, row: pd.Series) \ function_to_analyze = row[1]['method_where_invocation_occurred'] for class_decl in ast.get_proxy_nodes(ASTNodeType.CLASS_DECLARATION): - # class_ast = ast.get_subtree(class_decl) if class_decl.name != class_name: continue elif class_decl.name == class_name: objects_to_consider = list(class_decl.methods) + list(class_decl.constructors) or [] for ast_node in objects_to_consider: - result = MatchedResult( + result = RowResult( output_filename=full_path, input_filename=row[1]['input_filename'], class_name='', @@ -110,6 +143,10 @@ def validate_row(dataset_dir: Path, row: pd.Series) \ if opport: best_group = opport[0] lines = [node.line for node in best_group._optimal_opportunity] + # fixed_lines = fix_start_end_lines_for_opportunity( + # lines, + # full_path + # ) start_line_opportunity = min(lines) end_line_opportunity = max(lines) dataset_range_extraction = range(start_line_of_inserted_block, end_line_of_inserted_block) @@ -127,7 +164,6 @@ def validate_row(dataset_dir: Path, row: pd.Series) \ result.percent_matched = float(len(lines_intersected)) / len(dataset_range_extraction) else: result.no_opportunity_chosen = True - # print(class_decl.name, method_decl.name) except Exception as e: traceback.print_exc() @@ -145,7 +181,6 @@ def validate_row(dataset_dir: Path, row: pd.Series) \ result.failed_cases_in_validation_examples = True results.append(result) - # print(dataset_dir / src_filename) return results @@ -176,16 +211,16 @@ def validate_row(dataset_dir: Path, row: pd.Series) \ df = pd.read_csv(csv_dataset_filename) df = df[df['can_be_parsed']] - output_df = pd.DataFrame(columns=list(MatchedResult.__annotations__.keys())) + output_df = 
pd.DataFrame(columns=list(RowResult.__annotations__.keys())) - with ProcessPool(system_cores_qty) as executor: + with ProcessPool(1) as executor: validate_row_f = partial(validate_row, dataset_dir) future = executor.map(validate_row_f, df.iterrows(), timeout=10000, ) result = future.result() for index, row in tqdm(df.iterrows()): try: # print(row['input_filename']) - results: List[MatchedResult] = next(result) + results: List[RowResult] = next(result) for res in results: output_df = output_df.append(asdict(res), ignore_index=True) output_df.to_csv('matched.csv') @@ -205,9 +240,7 @@ def validate_row(dataset_dir: Path, row: pd.Series) \ print(f'No opportunity chosen: {no_opportunity_chosen} times') print(f'Total number of handled cases: {output_df.shape[0]}') print(f'Average of matched lines: {matched_percent}') - total_case_handled = output_df.shape[0] - \ - failed_cases_in_SEMI_algorithm - \ - failed_cases_in_validation_examples + total_case_handled = output_df.shape[0] - failed_cases_in_SEMI_algorithm - failed_cases_in_validation_examples if total_case_handled > 0: result = matched_cases / total_case_handled print(f'Matched {result}% of cases, {matched_cases} out of {total_case_handled}') From c2f5d539da000112df52294a7d7fb9dd1246425c Mon Sep 17 00:00:00 2001 From: Evgeny Maslov Date: Fri, 30 Oct 2020 13:54:40 +0300 Subject: [PATCH 11/17] Fix lines --- veniq/dataset_collection/validation.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/veniq/dataset_collection/validation.py b/veniq/dataset_collection/validation.py index abbeccbb..5b20a398 100644 --- a/veniq/dataset_collection/validation.py +++ b/veniq/dataset_collection/validation.py @@ -6,6 +6,7 @@ from pathlib import Path from typing import List, Tuple + import pandas as pd from numpy import mean from pebble import ProcessPool @@ -149,7 +150,10 @@ def validate_row(dataset_dir: Path, row: pd.Series) \ # ) start_line_opportunity = min(lines) end_line_opportunity = max(lines) - dataset_range_extraction = range(start_line_of_inserted_block, end_line_of_inserted_block) + dataset_range_extraction = range( + start_line_of_inserted_block, + end_line_of_inserted_block + 1 + ) lines_intersected = set(dataset_range_extraction) & set(lines) result.class_name = class_decl.name result.method_name = ast_node.name From 3eb7f509dcb4ea496b281588cc1ac6d746bf43d8 Mon Sep 17 00:00:00 2001 From: Evgeny Maslov Date: Fri, 30 Oct 2020 17:26:08 +0300 Subject: [PATCH 12/17] New test added and fixed --- test/validation/BaseTextEditor.java | 66 ++++++++++++++++++++++++++ test/validation/CssPreprocessors.java | 43 +++++++++++++++++ test/validation/MetadataEncoder.java | 59 +++++++++++++++++++++++ test/validation/test_validation.py | 45 ++++++++++++++---- veniq/dataset_collection/validation.py | 52 ++++++++++++-------- 5 files changed, 238 insertions(+), 27 deletions(-) create mode 100644 test/validation/BaseTextEditor.java create mode 100644 test/validation/CssPreprocessors.java create mode 100644 test/validation/MetadataEncoder.java diff --git a/test/validation/BaseTextEditor.java b/test/validation/BaseTextEditor.java new file mode 100644 index 00000000..9100d6a9 --- /dev/null +++ b/test/validation/BaseTextEditor.java @@ -0,0 +1,66 @@ +package org.jkiss.dbeaver.ui.editors.text; +import org.eclipse.core.resources.IFile; +import org.eclipse.core.resources.ResourcesPlugin; +import org.eclipse.core.runtime.CoreException; +import org.eclipse.jface.action.GroupMarker; +import org.eclipse.jface.action.IAction; +import 
org.eclipse.jface.action.IMenuManager; +import org.eclipse.jface.action.Separator; +import org.eclipse.jface.text.IDocument; +import org.eclipse.jface.text.IUndoManager; +import org.eclipse.jface.text.TextViewer; +import org.eclipse.jface.text.source.SourceViewer; +import org.eclipse.swt.SWT; +import org.eclipse.swt.custom.ST; +import org.eclipse.swt.custom.StyledText; +import org.eclipse.swt.widgets.Composite; +import org.eclipse.ui.IEditorInput; +import org.eclipse.ui.IEditorPart; +import org.eclipse.ui.IWorkbenchActionConstants; +import org.eclipse.ui.texteditor.AbstractDecoratedTextEditor; +import org.eclipse.ui.texteditor.IDocumentProvider; +import org.eclipse.ui.texteditor.ITextEditorActionConstants; +import org.jkiss.code.Nullable; +import org.jkiss.dbeaver.runtime.DBWorkbench; +import org.jkiss.dbeaver.ui.ICommentsSupport; +import org.jkiss.dbeaver.ui.ISingleControlEditor; +import org.jkiss.dbeaver.ui.UIUtils; +import org.jkiss.dbeaver.ui.dialogs.DialogUtils; +import org.jkiss.dbeaver.ui.editors.*; +import org.jkiss.dbeaver.utils.ContentUtils; +import org.jkiss.dbeaver.utils.GeneralUtils; +import org.jkiss.utils.IOUtils; +import java.io.*; +import java.lang.reflect.InvocationTargetException; +import java.util.ArrayList; +import java.util.List; +public abstract class BaseTextEditor extends AbstractDecoratedTextEditor implements ISingleControlEditor { + public static final String TEXT_EDITOR_CONTEXT = "org.eclipse.ui.textEditorScope"; + public static final String GROUP_SQL_PREFERENCES = "sql.preferences"; + public static final String GROUP_SQL_ADDITIONS = "sql.additions"; + public static final String GROUP_SQL_EXTRAS = "sql.extras"; + private List actionContributors = new ArrayList<>(); + public void addContextMenuContributor(IActionContributor contributor) { + actionContributors.add(contributor); + } + public static BaseTextEditor getTextEditor(IEditorPart editor) + { + if (editor == null) { + return null; + } + if (editor instanceof BaseTextEditor) { + return (BaseTextEditor) editor; + } + return editor.getAdapter(BaseTextEditor.class); + } + @Override + protected void doSetInput(IEditorInput input) throws CoreException { + if (input != getEditorInput()) { + IEditorInput editorInput = getEditorInput(); + if (editorInput instanceof IStatefulEditorInput) { + ((IStatefulEditorInput) editorInput).release(); + } + } + super.doSetInput(input); + } +} \ No newline at end of file diff --git a/test/validation/CssPreprocessors.java b/test/validation/CssPreprocessors.java new file mode 100644 index 00000000..c768831e --- /dev/null +++ b/test/validation/CssPreprocessors.java @@ -0,0 +1,43 @@ +package org.netbeans.modules.web.common.api; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.CopyOnWriteArrayList; +import org.netbeans.api.annotations.common.CheckForNull; +import org.netbeans.api.annotations.common.NonNull; +import org.netbeans.api.annotations.common.NullAllowed; +import org.netbeans.api.project.Project; +import org.netbeans.modules.web.common.cssprep.CssPreprocessorAccessor; +import org.netbeans.modules.web.common.cssprep.CssPreprocessorsAccessor; +import org.netbeans.modules.web.common.spi.CssPreprocessorImplementation; +import org.netbeans.modules.web.common.spi.CssPreprocessorImplementationListener; +import org.openide.filesystems.FileObject; +import org.openide.util.Lookup; +import org.openide.util.LookupEvent; +import org.openide.util.LookupListener; +import org.openide.util.Parameters; 
+import org.openide.util.RequestProcessor; +import org.openide.util.lookup.Lookups; +public final class CssPreprocessors { + public static final String PREPROCESSORS_PATH = "CSS/PreProcessors"; + private static final RequestProcessor RP = new RequestProcessor(CssPreprocessors.class.getName(), 2); + private static final Lookup.Result PREPROCESSORS = Lookups.forPath(PREPROCESSORS_PATH).lookupResult(CssPreprocessorImplementation.class); + private static final CssPreprocessors INSTANCE = new CssPreprocessors(); + private final List preprocessors = new CopyOnWriteArrayList<>(); + final CssPreprocessorsListener.Support listenersSupport = new CssPreprocessorsListener.Support(); + private final PreprocessorImplementationsListener preprocessorImplementationsListener = new PreprocessorImplementationsListener(); + + void reinitProcessors() { + synchronized (preprocessors) { + clearProcessors(); + assert preprocessors.isEmpty() : "Empty preprocessors expected but: " + preprocessors; + preprocessors.addAll(map(PREPROCESSORS.allInstances())); + for (CssPreprocessor cssPreprocessor : preprocessors) { + cssPreprocessor.getDelegate().addCssPreprocessorListener(preprocessorImplementationsListener); + } + } + listenersSupport.firePreprocessorsChanged(); + } + +} \ No newline at end of file diff --git a/test/validation/MetadataEncoder.java b/test/validation/MetadataEncoder.java new file mode 100644 index 00000000..612fa758 --- /dev/null +++ b/test/validation/MetadataEncoder.java @@ -0,0 +1,59 @@ +package org.springframework.messaging.rsocket; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import io.netty.buffer.ByteBuf; +import io.netty.buffer.ByteBufAllocator; +import io.netty.buffer.CompositeByteBuf; +import io.rsocket.metadata.CompositeMetadataFlyweight; +import io.rsocket.metadata.TaggingMetadataFlyweight; +import io.rsocket.metadata.WellKnownMimeType; +import reactor.core.publisher.Mono; +import org.springframework.core.ReactiveAdapter; +import org.springframework.core.ResolvableType; +import org.springframework.core.codec.Encoder; +import org.springframework.core.io.buffer.DataBuffer; +import org.springframework.core.io.buffer.DataBufferFactory; +import org.springframework.core.io.buffer.NettyDataBufferFactory; +import org.springframework.lang.Nullable; +import org.springframework.util.Assert; +import org.springframework.util.CollectionUtils; +import org.springframework.util.MimeType; +import org.springframework.util.ObjectUtils; +final class MetadataEncoder { + private static final Pattern VARS_PATTERN = Pattern.compile("\\{(.+?)}"); + private static final Object NO_VALUE = new Object(); + private final MimeType metadataMimeType; + private final RSocketStrategies strategies; + private final boolean isComposite; + private final ByteBufAllocator allocator; + @Nullable + private String route; + private final List metadataEntries = new ArrayList<>(4); + private boolean hasAsyncValues; + MetadataEncoder(MimeType metadataMimeType, RSocketStrategies strategies) { + Assert.notNull(metadataMimeType, "'metadataMimeType' is required"); + Assert.notNull(strategies, "RSocketStrategies is required"); + this.metadataMimeType = metadataMimeType; + this.strategies = strategies; + this.isComposite = this.metadataMimeType.toString().equals( + WellKnownMimeType.MESSAGE_RSOCKET_COMPOSITE_METADATA.getString()); + this.allocator = bufferFactory() instanceof NettyDataBufferFactory ? 
+ ((NettyDataBufferFactory) bufferFactory()).getByteBufAllocator() : ByteBufAllocator.DEFAULT; + } + private DataBufferFactory bufferFactory() { + return this.strategies.dataBufferFactory(); + } + public MetadataEncoder route(String route, Object... routeVars) { + this.route = expand(route, routeVars); + if (!this.isComposite) { + int count = this.route != null ? this.metadataEntries.size() + 1 : this.metadataEntries.size(); + Assert.isTrue(count < 2, "Composite metadata required for multiple metadata entries."); + } + return this; + } + +} \ No newline at end of file diff --git a/test/validation/test_validation.py b/test/validation/test_validation.py index a6002e6e..14b199c7 100644 --- a/test/validation/test_validation.py +++ b/test/validation/test_validation.py @@ -9,31 +9,60 @@ class TestValidation(TestCase): folder = Path(__file__).absolute().parent - def test_validation_semi_2_closing_brackets(self): + def test_validation_semi_2_closing_brackets_with_2_lines_before_block(self): file = self.folder / "DynaMenuModel.java" - lines_extracted_by_semi = list(range(91, 107)) + # range doesn't include the last item + lines_extracted_by_semi = list(range(90, 107)) fixed_lines = fix_start_end_lines_for_opportunity( lines_extracted_by_semi, str(file) ) - self.assertEqual(range(91, 109), fixed_lines) + self.assertEqual((90, 108), fixed_lines) + + def test_validation_semi_2_closing_brackets_without_lines_before_block(self): + file = self.folder / "BaseTextEditor.java" + # range doesn't include the last item + lines_extracted_by_semi = list(range(57, 62)) + fixed_lines = fix_start_end_lines_for_opportunity( + lines_extracted_by_semi, + str(file) + ) + self.assertEqual((57, 63), fixed_lines) def test_semi_no_need_to_find_closing_brackets(self): file = self.folder / "User.java" - lines_extracted_by_semi = list(range(17, 22)) + lines_extracted_by_semi = list(range(16, 22)) fixed_lines = fix_start_end_lines_for_opportunity( lines_extracted_by_semi, str(file) ) - self.assertEqual(range(17, 22), fixed_lines) + self.assertEqual((16, 21), fixed_lines) + + def test_validation_semi_closing_brackets_with_2_blocks(self): + file = self.folder / "CssPreprocessors.java" + lines_extracted_by_semi = list(range(31, 38)) + fixed_lines = fix_start_end_lines_for_opportunity( + lines_extracted_by_semi, + str(file) + ) + self.assertEqual((31, 39), fixed_lines) def test_validation_semi_1_closing_brackets(self): file = self.folder / "NameNodeRpcServer.java" - lines_extracted_by_semi = list(range(231, 234)) + lines_extracted_by_semi = list(range(231, 235)) + fixed_lines = fix_start_end_lines_for_opportunity( + lines_extracted_by_semi, + str(file) + ) + self.assertEqual((231, 235), fixed_lines) + + file = self.folder / "MetadataEncoder.java" + lines_extracted_by_semi = list(range(50, 55)) fixed_lines = fix_start_end_lines_for_opportunity( lines_extracted_by_semi, str(file) ) - self.assertEqual(range(231, 235), fixed_lines) + self.assertEqual((50, 55), fixed_lines) - # def test_1_open_bracket(self): + def test_get_percent_matched(self): + self.assertEqual(0, 0) diff --git a/veniq/dataset_collection/validation.py b/veniq/dataset_collection/validation.py index 5b20a398..2739d579 100644 --- a/veniq/dataset_collection/validation.py +++ b/veniq/dataset_collection/validation.py @@ -58,7 +58,7 @@ class RowResult: def fix_start_end_lines_for_opportunity( extracted_lines_of_opportunity: List[int], - filepath: str) -> range: + filepath: str) -> Tuple[int, int]: """ Finds start and end lines for opportunity @@ -74,21 +74,35 @@ def 
fix_start_end_lines_for_opportunity( close_brackets = 0 for x in extraction: close_brackets += x.count('}') - for x in extraction: open_brackets += x.count('{') - if open_brackets < close_brackets: + if (open_brackets < close_brackets): diff = close_brackets - open_brackets - while diff > 0: - start_line_opportunity -= 1 - diff -= 1 - elif open_brackets > close_brackets: + count = 1 + for text_line in text[end_line_opportunity:]: + if diff < 1: + break + else: + if text_line.find('{') > -1: + diff -= 1 + count += 1 + + start_line_opportunity += count - 1 + + elif (open_brackets > close_brackets): diff = open_brackets - close_brackets - while diff > 0: - end_line_opportunity += 1 - diff -= 1 + count = 1 + for text_line in text[end_line_opportunity:]: + if diff < 1: + break + else: + if text_line.find('}') > -1: + diff -= 1 + count += 1 + + end_line_opportunity += count - 1 - return range(start_line_opportunity, end_line_opportunity + 1) + return start_line_opportunity, end_line_opportunity def validate_row(dataset_dir: Path, row: pd.Series) \ @@ -144,17 +158,17 @@ def validate_row(dataset_dir: Path, row: pd.Series) \ if opport: best_group = opport[0] lines = [node.line for node in best_group._optimal_opportunity] - # fixed_lines = fix_start_end_lines_for_opportunity( - # lines, - # full_path - # ) - start_line_opportunity = min(lines) - end_line_opportunity = max(lines) + fixed_lines = fix_start_end_lines_for_opportunity( + lines, + full_path + ) + start_line_opportunity = min(fixed_lines) + end_line_opportunity = max(fixed_lines) dataset_range_extraction = range( start_line_of_inserted_block, end_line_of_inserted_block + 1 ) - lines_intersected = set(dataset_range_extraction) & set(lines) + lines_intersected = set(dataset_range_extraction) & set(fixed_lines) result.class_name = class_decl.name result.method_name = ast_node.name result.start_line_SEMI = start_line_opportunity @@ -221,7 +235,7 @@ def validate_row(dataset_dir: Path, row: pd.Series) \ validate_row_f = partial(validate_row, dataset_dir) future = executor.map(validate_row_f, df.iterrows(), timeout=10000, ) result = future.result() - for index, row in tqdm(df.iterrows()): + for index, row in tqdm(df.iterrows(), total=df.shape[0]): try: # print(row['input_filename']) results: List[RowResult] = next(result) From 1a2b4fe0916046c99a526fdae83293971eb2376e Mon Sep 17 00:00:00 2001 From: Evgeny Maslov Date: Mon, 2 Nov 2020 13:09:00 +0300 Subject: [PATCH 13/17] Add tests --- test/validation/test_validation.py | 20 ++++++++++++++++++-- veniq/dataset_collection/validation.py | 8 ++++++-- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/test/validation/test_validation.py b/test/validation/test_validation.py index 14b199c7..284f1ba3 100644 --- a/test/validation/test_validation.py +++ b/test/validation/test_validation.py @@ -1,7 +1,8 @@ from pathlib import Path from unittest import TestCase -from dataset_collection.validation import find_extraction_opportunities, fix_start_end_lines_for_opportunity +from dataset_collection.validation import find_extraction_opportunities, fix_start_end_lines_for_opportunity, \ + percent_matched from veniq.ast_framework import AST, ASTNodeType from veniq.utils.ast_builder import build_ast @@ -65,4 +66,19 @@ def test_validation_semi_1_closing_brackets(self): self.assertEqual((50, 55), fixed_lines) def test_get_percent_matched(self): - self.assertEqual(0, 0) + semi_lines = list(range(50, 58)) + dataset_lines = list(range(50, 58)) + percent = percent_matched(dataset_lines, semi_lines) + 
self.assertEqual(percent, 1.0) + + def test_percent_partially_matched(self): + semi_lines = list(range(65, 81)) + dataset_lines = list(range(69, 82)) + percent = percent_matched(dataset_lines, semi_lines) + self.assertEqual(percent, 12/13) + + def test_percent_not_matched(self): + semi_lines = list(range(65, 68)) + dataset_lines = list(range(69, 82)) + percent = percent_matched(dataset_lines, semi_lines) + self.assertEqual(percent, 0) diff --git a/veniq/dataset_collection/validation.py b/veniq/dataset_collection/validation.py index 2739d579..42e571df 100644 --- a/veniq/dataset_collection/validation.py +++ b/veniq/dataset_collection/validation.py @@ -168,7 +168,6 @@ def validate_row(dataset_dir: Path, row: pd.Series) \ start_line_of_inserted_block, end_line_of_inserted_block + 1 ) - lines_intersected = set(dataset_range_extraction) & set(fixed_lines) result.class_name = class_decl.name result.method_name = ast_node.name result.start_line_SEMI = start_line_opportunity @@ -179,7 +178,7 @@ def validate_row(dataset_dir: Path, row: pd.Series) \ and (end_line_of_inserted_block == end_line_opportunity): result.matched = True - result.percent_matched = float(len(lines_intersected)) / len(dataset_range_extraction) + result.percent_matched = percent_matched(dataset_range_extraction, fixed_lines) else: result.no_opportunity_chosen = True @@ -202,6 +201,11 @@ def validate_row(dataset_dir: Path, row: pd.Series) \ return results +def percent_matched(dataset_range_lines, semi_range_lines): + lines_intersected = set(dataset_range_lines) & set(semi_range_lines) + return float(len(lines_intersected)) / len(set(dataset_range_lines)) + + if __name__ == '__main__': parser = ArgumentParser() parser.add_argument( From 0507bcace7895dfffbdfef1e89edddbca406d9ee Mon Sep 17 00:00:00 2001 From: Evgeny Maslov Date: Mon, 2 Nov 2020 13:24:55 +0300 Subject: [PATCH 14/17] Fix mypy --- test/validation/test_validation.py | 6 +- veniq/dataset_collection/augmentation.py | 2 +- veniq/dataset_collection/validation.py | 78 ++++++++++++++---------- 3 files changed, 50 insertions(+), 36 deletions(-) diff --git a/test/validation/test_validation.py b/test/validation/test_validation.py index 284f1ba3..4ead85f1 100644 --- a/test/validation/test_validation.py +++ b/test/validation/test_validation.py @@ -1,10 +1,8 @@ from pathlib import Path from unittest import TestCase -from dataset_collection.validation import find_extraction_opportunities, fix_start_end_lines_for_opportunity, \ +from dataset_collection.validation import fix_start_end_lines_for_opportunity, \ percent_matched -from veniq.ast_framework import AST, ASTNodeType -from veniq.utils.ast_builder import build_ast class TestValidation(TestCase): @@ -75,7 +73,7 @@ def test_percent_partially_matched(self): semi_lines = list(range(65, 81)) dataset_lines = list(range(69, 82)) percent = percent_matched(dataset_lines, semi_lines) - self.assertEqual(percent, 12/13) + self.assertEqual(percent, 12 / 13) def test_percent_not_matched(self): semi_lines = list(range(65, 68)) diff --git a/veniq/dataset_collection/augmentation.py b/veniq/dataset_collection/augmentation.py index ce41c3e9..77bd2de2 100644 --- a/veniq/dataset_collection/augmentation.py +++ b/veniq/dataset_collection/augmentation.py @@ -612,4 +612,4 @@ def save_text_to_new_file(input_dir: Path, text: str, filename: Path) -> Path: shutil.rmtree(full_dataset_folder) if small_dataset_folder.exists(): - shutil.rmtree(small_dataset_folder) \ No newline at end of file + shutil.rmtree(small_dataset_folder) diff --git 
a/veniq/dataset_collection/validation.py b/veniq/dataset_collection/validation.py index 42e571df..6d9e3138 100644 --- a/veniq/dataset_collection/validation.py +++ b/veniq/dataset_collection/validation.py @@ -6,21 +6,20 @@ from pathlib import Path from typing import List, Tuple - import pandas as pd from numpy import mean from pebble import ProcessPool from tqdm import tqdm -from veniq.baselines.semi._common_types import ExtractionOpportunity -from veniq.metrics.ncss.ncss import NCSSMetric -from veniq.utils.encoding_detector import read_text_with_autodetected_encoding +from ast_framework import ASTNode from veniq.ast_framework import AST, ASTNodeType from veniq.baselines.semi.create_extraction_opportunities import create_extraction_opportunities from veniq.baselines.semi.extract_semantic import extract_method_statements_semantic from veniq.baselines.semi.filter_extraction_opportunities import filter_extraction_opportunities from veniq.baselines.semi.rank_extraction_opportunities import rank_extraction_opportunities, ExtractionOpportunityGroup +from veniq.metrics.ncss.ncss import NCSSMetric from veniq.utils.ast_builder import build_ast +from veniq.utils.encoding_detector import read_text_with_autodetected_encoding def find_extraction_opportunities( @@ -76,7 +75,7 @@ def fix_start_end_lines_for_opportunity( close_brackets += x.count('}') open_brackets += x.count('{') - if (open_brackets < close_brackets): + if open_brackets < close_brackets: diff = close_brackets - open_brackets count = 1 for text_line in text[end_line_opportunity:]: @@ -89,7 +88,7 @@ def fix_start_end_lines_for_opportunity( start_line_opportunity += count - 1 - elif (open_brackets > close_brackets): + elif open_brackets > close_brackets: diff = open_brackets - close_brackets count = 1 for text_line in text[end_line_opportunity:]: @@ -105,6 +104,7 @@ def fix_start_end_lines_for_opportunity( return start_line_opportunity, end_line_opportunity +# flake8: noqa: C901 def validate_row(dataset_dir: Path, row: pd.Series) \ -> List[RowResult]: """ @@ -156,29 +156,15 @@ def validate_row(dataset_dir: Path, row: pd.Series) \ ast_subtree = ast.get_subtree(ast_node) opport = find_extraction_opportunities(ast_subtree) if opport: - best_group = opport[0] - lines = [node.line for node in best_group._optimal_opportunity] - fixed_lines = fix_start_end_lines_for_opportunity( - lines, - full_path - ) - start_line_opportunity = min(fixed_lines) - end_line_opportunity = max(fixed_lines) - dataset_range_extraction = range( + find_matched_lines( + ast_node, + ast_subtree, + class_decl, start_line_of_inserted_block, - end_line_of_inserted_block + 1 - ) - result.class_name = class_decl.name - result.method_name = ast_node.name - result.start_line_SEMI = start_line_opportunity - result.end_line_SEMI = end_line_opportunity - result.ncss = NCSSMetric().value(ast_subtree) - - if (start_line_of_inserted_block == start_line_opportunity) \ - and (end_line_of_inserted_block == end_line_opportunity): - result.matched = True - - result.percent_matched = percent_matched(dataset_range_extraction, fixed_lines) + end_line_of_inserted_block, + full_path, + opport, + result) else: result.no_opportunity_chosen = True @@ -201,6 +187,38 @@ def validate_row(dataset_dir: Path, row: pd.Series) \ return results +def find_matched_lines( + ast_node: ASTNode, + ast_subtree: AST, + class_decl: ASTNode, + start_line_of_inserted_block: int, + end_line_of_inserted_block: int, + full_path: str, + opportunities_list: List[ExtractionOpportunityGroup], + result: RowResult) -> None: + 
best_group = opportunities_list[0] + lines = [node.line for node in best_group._optimal_opportunity] + fixed_lines = fix_start_end_lines_for_opportunity( + lines, + full_path + ) + start_line_opportunity = min(fixed_lines) + end_line_opportunity = max(fixed_lines) + dataset_range_extraction = range( + start_line_of_inserted_block, + end_line_of_inserted_block + 1 + ) + result.class_name = class_decl.name + result.method_name = ast_node.name + result.start_line_SEMI = start_line_opportunity + result.end_line_SEMI = end_line_opportunity + result.ncss = NCSSMetric().value(ast_subtree) + if (start_line_of_inserted_block == start_line_opportunity) \ + and (end_line_of_inserted_block == end_line_opportunity): + result.matched = True + result.percent_matched = percent_matched(dataset_range_extraction, fixed_lines) + + def percent_matched(dataset_range_lines, semi_range_lines): lines_intersected = set(dataset_range_lines) & set(semi_range_lines) return float(len(lines_intersected)) / len(set(dataset_range_lines)) @@ -241,13 +259,11 @@ def percent_matched(dataset_range_lines, semi_range_lines): result = future.result() for index, row in tqdm(df.iterrows(), total=df.shape[0]): try: - # print(row['input_filename']) results: List[RowResult] = next(result) for res in results: output_df = output_df.append(asdict(res), ignore_index=True) output_df.to_csv('matched.csv') - except Exception as e: - # print(f"Exception inside thread happened: {str(e)}") + except Exception: print(traceback.format_exc()) continue From fda4e8c12789c8f917fff916e1a4799d16cdafd0 Mon Sep 17 00:00:00 2001 From: Evgeny Maslov Date: Mon, 2 Nov 2020 13:27:52 +0300 Subject: [PATCH 15/17] Fix tests --- test/validation/test_validation.py | 2 +- veniq/dataset_collection/validation.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/validation/test_validation.py b/test/validation/test_validation.py index 4ead85f1..59d1a5d3 100644 --- a/test/validation/test_validation.py +++ b/test/validation/test_validation.py @@ -1,7 +1,7 @@ from pathlib import Path from unittest import TestCase -from dataset_collection.validation import fix_start_end_lines_for_opportunity, \ +from veniq.dataset_collection.validation import fix_start_end_lines_for_opportunity, \ percent_matched diff --git a/veniq/dataset_collection/validation.py b/veniq/dataset_collection/validation.py index 6d9e3138..7e18120f 100644 --- a/veniq/dataset_collection/validation.py +++ b/veniq/dataset_collection/validation.py @@ -11,7 +11,7 @@ from pebble import ProcessPool from tqdm import tqdm -from ast_framework import ASTNode +from veniq.ast_framework import ASTNode from veniq.ast_framework import AST, ASTNodeType from veniq.baselines.semi.create_extraction_opportunities import create_extraction_opportunities from veniq.baselines.semi.extract_semantic import extract_method_statements_semantic From 1bc94a9e8833b62bb355b255527ea29ceb5375e2 Mon Sep 17 00:00:00 2001 From: Evgeny Maslov Date: Mon, 2 Nov 2020 15:46:39 +0300 Subject: [PATCH 16/17] Fix comments --- veniq/dataset_collection/validation.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/veniq/dataset_collection/validation.py b/veniq/dataset_collection/validation.py index 7e18120f..57d0a9d0 100644 --- a/veniq/dataset_collection/validation.py +++ b/veniq/dataset_collection/validation.py @@ -11,6 +11,7 @@ from pebble import ProcessPool from tqdm import tqdm +from baselines.semi._common_types import ExtractionOpportunity from veniq.ast_framework import ASTNode from 
veniq.ast_framework import AST, ASTNodeType from veniq.baselines.semi.create_extraction_opportunities import create_extraction_opportunities @@ -23,7 +24,7 @@ def find_extraction_opportunities( - method_ast: AST): + method_ast: AST) -> List[ExtractionOpportunityGroup]: statements_semantic = extract_method_statements_semantic(method_ast) extraction_opportunities = create_extraction_opportunities(statements_semantic) filtered_extraction_opportunities = filter_extraction_opportunities( @@ -112,9 +113,8 @@ def validate_row(dataset_dir: Path, row: pd.Series) \ :param dataset_dir: directory to dataset, path before the relative path in output_filename - :param row: row of dataframe - :return: boolean value whether we should consider this row or skip it, - Stats - return collected stats + :param row: row of dataframe of synth validation dataset + :return: Stats - return collected stats """ results = [] try: @@ -136,7 +136,7 @@ def validate_row(dataset_dir: Path, row: pd.Series) \ result = RowResult( output_filename=full_path, input_filename=row[1]['input_filename'], - class_name='', + class_name='Not available', method_name='', start_line_SEMI=-1, end_line_SEMI=-1, @@ -233,7 +233,7 @@ def percent_matched(dataset_range_lines, semi_range_lines): ) parser.add_argument( "-i", "--csv_input", - help="Path for csv" + help="Path for csv with synth dataset" ) system_cores_qty = os.cpu_count() or 1 parser.add_argument( From 98655bb368a2c494a71fac28e3f60ceabccf3df4 Mon Sep 17 00:00:00 2001 From: Evgeny Maslov Date: Mon, 2 Nov 2020 17:07:14 +0300 Subject: [PATCH 17/17] Fix tests --- test/validation/test_validation.py | 26 ++++++++++++++------------ veniq/dataset_collection/validation.py | 26 ++++++++++++++------------ 2 files changed, 28 insertions(+), 24 deletions(-) diff --git a/test/validation/test_validation.py b/test/validation/test_validation.py index 59d1a5d3..b020f87d 100644 --- a/test/validation/test_validation.py +++ b/test/validation/test_validation.py @@ -11,57 +11,59 @@ class TestValidation(TestCase): def test_validation_semi_2_closing_brackets_with_2_lines_before_block(self): file = self.folder / "DynaMenuModel.java" # range doesn't include the last item - lines_extracted_by_semi = list(range(90, 107)) + # also, add it as it would be numbered starting at 1 + lines_extracted_by_semi = list(range(91, 109)) fixed_lines = fix_start_end_lines_for_opportunity( lines_extracted_by_semi, str(file) ) - self.assertEqual((90, 108), fixed_lines) + self.assertEqual((91, 108), fixed_lines) def test_validation_semi_2_closing_brackets_without_lines_before_block(self): file = self.folder / "BaseTextEditor.java" # range doesn't include the last item - lines_extracted_by_semi = list(range(57, 62)) + # also, add it as it would be numbered starting at 1 + lines_extracted_by_semi = list(range(58, 62)) fixed_lines = fix_start_end_lines_for_opportunity( lines_extracted_by_semi, str(file) ) - self.assertEqual((57, 63), fixed_lines) + self.assertEqual((58, 63), fixed_lines) def test_semi_no_need_to_find_closing_brackets(self): file = self.folder / "User.java" - lines_extracted_by_semi = list(range(16, 22)) + lines_extracted_by_semi = list(range(17, 22)) fixed_lines = fix_start_end_lines_for_opportunity( lines_extracted_by_semi, str(file) ) - self.assertEqual((16, 21), fixed_lines) + self.assertEqual((17, 21), fixed_lines) def test_validation_semi_closing_brackets_with_2_blocks(self): file = self.folder / "CssPreprocessors.java" - lines_extracted_by_semi = list(range(31, 38)) + lines_extracted_by_semi = list(range(32, 38)) 
         fixed_lines = fix_start_end_lines_for_opportunity(
             lines_extracted_by_semi,
             str(file)
         )
-        self.assertEqual((31, 39), fixed_lines)
+        self.assertEqual((32, 39), fixed_lines)

     def test_validation_semi_1_closing_brackets(self):
         file = self.folder / "NameNodeRpcServer.java"
-        lines_extracted_by_semi = list(range(231, 235))
+        lines_extracted_by_semi = list(range(232, 235))
         fixed_lines = fix_start_end_lines_for_opportunity(
             lines_extracted_by_semi,
             str(file)
         )
-        self.assertEqual((231, 235), fixed_lines)
+        self.assertEqual((232, 235), fixed_lines)

         file = self.folder / "MetadataEncoder.java"
-        lines_extracted_by_semi = list(range(50, 55))
+        lines_extracted_by_semi = list(range(51, 55))
         fixed_lines = fix_start_end_lines_for_opportunity(
             lines_extracted_by_semi,
             str(file)
         )
-        self.assertEqual((50, 55), fixed_lines)
+        self.assertEqual((51, 55), fixed_lines)

     def test_get_percent_matched(self):
         semi_lines = list(range(50, 58))
diff --git a/veniq/dataset_collection/validation.py b/veniq/dataset_collection/validation.py
index 57d0a9d0..1c2fc206 100644
--- a/veniq/dataset_collection/validation.py
+++ b/veniq/dataset_collection/validation.py
@@ -11,9 +11,8 @@
 from pebble import ProcessPool
 from tqdm import tqdm

-from baselines.semi._common_types import ExtractionOpportunity
-from veniq.ast_framework import ASTNode
 from veniq.ast_framework import AST, ASTNodeType
+from veniq.ast_framework import ASTNode
 from veniq.baselines.semi.create_extraction_opportunities import create_extraction_opportunities
 from veniq.baselines.semi.extract_semantic import extract_method_statements_semantic
 from veniq.baselines.semi.filter_extraction_opportunities import filter_extraction_opportunities
@@ -69,7 +68,8 @@ def fix_start_end_lines_for_opportunity(
     start_line_opportunity = min(extracted_lines_of_opportunity)
     end_line_opportunity = max(extracted_lines_of_opportunity)
     text = read_text_with_autodetected_encoding(filepath).split('\n')
-    extraction = text[start_line_opportunity:end_line_opportunity]
+
+    extraction = text[start_line_opportunity - 1:end_line_opportunity]
     open_brackets = 0
     close_brackets = 0
     for x in extraction:
@@ -80,9 +80,7 @@ def fix_start_end_lines_for_opportunity(
         diff = close_brackets - open_brackets
         count = 1
         for text_line in text[end_line_opportunity:]:
-            if diff < 1:
-                break
-            else:
+            while diff > 0:
                 if text_line.find('{') > -1:
                     diff -= 1
                 count += 1
@@ -93,9 +91,7 @@ def fix_start_end_lines_for_opportunity(
         diff = open_brackets - close_brackets
         count = 1
         for text_line in text[end_line_opportunity:]:
-            if diff < 1:
-                break
-            else:
+            while diff > 0:
                 if text_line.find('}') > -1:
                     diff -= 1
                 count += 1
@@ -128,9 +124,7 @@ def validate_row(dataset_dir: Path, row: pd.Series) \

         function_to_analyze = row[1]['method_where_invocation_occurred']
         for class_decl in ast.get_proxy_nodes(ASTNodeType.CLASS_DECLARATION):
-            if class_decl.name != class_name:
-                continue
-            elif class_decl.name == class_name:
+            if class_decl.name == class_name:
                 objects_to_consider = list(class_decl.methods) + list(class_decl.constructors) or []
                 for ast_node in objects_to_consider:
                     result = RowResult(
@@ -154,8 +148,11 @@ def validate_row(dataset_dir: Path, row: pd.Series) \
                         continue
                     try:
                         ast_subtree = ast.get_subtree(ast_node)
+                        print(src_filename, 'start')
                         opport = find_extraction_opportunities(ast_subtree)
+                        print(src_filename, 'end')
                         if opport:
+                            print(src_filename, 'start find_matched_lines')
                             find_matched_lines(
                                 ast_node,
                                 ast_subtree,
@@ -165,6 +162,8 @@ def validate_row(dataset_dir: Path, row: pd.Series) \
                                 full_path,
                                 opport,
                                 result)
+
+                            print(src_filename, 'end find_matched_lines')
                         else:
                             result.no_opportunity_chosen = True
@@ -196,6 +195,9 @@ def find_matched_lines(
         full_path: str,
         opportunities_list: List[ExtractionOpportunityGroup],
         result: RowResult) -> None:
+
+    if Path(full_path).stem == 'ParametersPickerOperator_cf13c04617679fdf0fe1779623e8a28e41e89e045c640a1f507d166ba1e8370f_verify_111':
+        print()
     best_group = opportunities_list[0]
     lines = [node.line for node in best_group._optimal_opportunity]
     fixed_lines = fix_start_end_lines_for_opportunity(