refactor: optimize featurebased algorithms.

OSLL · Sep 28, 2024 · c44361a · c44361a
1 parent 3eef1bc
commit c44361a
Show file tree

Hide file tree

Showing 7 changed files with 18 additions and 17 deletions.
diff --git a/Makefile b/Makefile
@@ -1,4 +1,4 @@
-UTIL_VERSION            := 0.5.4
+UTIL_VERSION            := 0.5.5
 UTIL_NAME               := codeplag
 PWD                     := $(shell pwd)
 

diff --git a/src/codeplag/algorithms/compare.py b/src/codeplag/algorithms/compare.py
@@ -88,7 +88,7 @@ def compare_works(
     if threshold and (fast_metrics.weighted_average * 100.0) < threshold:
         return CompareInfo(fast=fast_metrics)
 
-    compliance_matrix = np.zeros(
+    compliance_matrix = np.empty(
         (len(features1.head_nodes), len(features2.head_nodes), 2), dtype=np.int64
     )
     struct_res = struct_compare(features1.structure, features2.structure, compliance_matrix)

diff --git a/src/codeplag/algorithms/featurebased.py b/src/codeplag/algorithms/featurebased.py
@@ -19,22 +19,23 @@ def counter_metric(counter1: Mapping[str, int], counter2: Mapping[str, int]) ->
     if len(counter1) == 0 and len(counter2) == 0:
         return 1.0
 
-    percent_of_same = [0, 0]
+    percent_of_same_numerator = 0
+    percent_of_same_denominator = 0
     for key in counter1:
         if key not in counter2:
-            percent_of_same[1] += counter1[key]
+            percent_of_same_denominator += counter1[key]
             continue
-        percent_of_same[0] += min(counter1[key], counter2[key])
-        percent_of_same[1] += max(counter1[key], counter2[key])
+        percent_of_same_numerator += min(counter1[key], counter2[key])
+        percent_of_same_denominator += max(counter1[key], counter2[key])
     for key in counter2:
         if key not in counter1:
-            percent_of_same[1] += counter2[key]
+            percent_of_same_denominator += counter2[key]
             continue
 
-    if percent_of_same[1] == 0:
+    if percent_of_same_denominator == 0:
         return 0.0
 
-    return percent_of_same[0] / percent_of_same[1]
+    return percent_of_same_numerator / percent_of_same_denominator
 
 
 def op_shift_metric(ops1: list[str], ops2: list[str]) -> tuple[int, float]:
@@ -54,7 +55,7 @@ def op_shift_metric(ops1: list[str], ops2: list[str]) -> tuple[int, float]:
         ops1, ops2 = ops2, ops1
         count_el_f, count_el_s = count_el_s, count_el_f
 
-    y = np.zeros(count_el_s, dtype=np.float32)
+    y = np.empty(count_el_s, dtype=np.float32)
 
     shift = 0
     while shift < count_el_s:
@@ -234,11 +235,11 @@ def struct_compare(
     key_indexes1.append(count_of_nodes1)
     key_indexes2.append(count_of_nodes2)
 
-    array = np.zeros((count_of_children1, count_of_children2, 2), dtype=np.int64)
+    array = np.empty((count_of_children1, count_of_children2, 2), dtype=np.int64)
 
     for i in np.arange(0, count_of_children1, 1):
+        section1 = tree1[key_indexes1[i] + 1 : key_indexes1[i + 1]]
         for j in np.arange(0, count_of_children2, 1):
-            section1 = tree1[key_indexes1[i] + 1 : key_indexes1[i + 1]]
             section2 = tree2[key_indexes2[j] + 1 : key_indexes2[j + 1]]
             array[i][j] = struct_compare(section1, section2)
 

diff --git a/src/codeplag/algorithms/stringbased.py b/src/codeplag/algorithms/stringbased.py
@@ -11,7 +11,7 @@ def __init__(self: Self, sequence1: Sequence, sequence2: Sequence) -> None:
         self.s1_length = len(sequence1)
         self.s2_length = len(sequence2)
         self.distance = -1
-        self.distance_matrix = np.zeros((self.s1_length + 1, self.s2_length + 1), dtype=np.int64)
+        self.distance_matrix = np.empty((self.s1_length + 1, self.s2_length + 1), dtype=np.int64)
 
     @staticmethod
     def m(symbol1: str, symbol2: str) -> Literal[0, 1]:

diff --git a/src/codeplag/handlers/check.py b/src/codeplag/handlers/check.py
@@ -348,7 +348,7 @@ def compliance_matrix_to_df(
     head_nodes1: list[str],
     head_nodes2: list[str],
 ) -> pd.DataFrame:
-    data = np.zeros(
+    data = np.empty(
         (
             compliance_matrix.shape[0],
             compliance_matrix.shape[1],

diff --git a/src/codeplag/handlers/report.py b/src/codeplag/handlers/report.py
@@ -76,7 +76,7 @@ def html_report_create(report_path: Path, report_type: ReportType) -> Literal[0,
 
 def _convert_similarity_matrix_to_percent_matrix(matrix: NDArray) -> NDArray:
     """Convert compliance matrix of size N x M x 2 to percent 2 dimensional matrix."""
-    percent_matrix = np.zeros((matrix.shape[0], matrix.shape[1]), dtype=np.float64)
+    percent_matrix = np.empty((matrix.shape[0], matrix.shape[1]), dtype=np.float64)
     for i in range(matrix.shape[0]):
         for j in range(matrix.shape[1]):
             percent_matrix[i][j] = round(matrix[i][j][0] / matrix[i][j][1] * 100, 2)

diff --git a/test/unit/codeplag/algorithms/test_featurebased.py b/test/unit/codeplag/algorithms/test_featurebased.py
@@ -140,7 +140,7 @@ def test_struct_compare_normal(self: Self) -> None:
                       (3, 8), (3, 8), (2, 9)]
         count_ch1 = (get_children_indexes(structure1, len(structure1)))[1]
         count_ch2 = (get_children_indexes(structure2, len(structure2)))[1]
-        compliance_matrix = np.zeros((count_ch1, count_ch2, 2),
+        compliance_matrix = np.empty((count_ch1, count_ch2, 2),
                                      dtype=np.int64)
         res = struct_compare(structure1, structure2,
                              compliance_matrix)
@@ -160,7 +160,7 @@ def test_struct_compare_normal(self: Self) -> None:
                       (4, 4), (5, 8), (4, 10), (5, 4)]
         count_ch1 = (get_children_indexes(structure1, len(structure1)))[1]
         count_ch2 = (get_children_indexes(structure2, len(structure2)))[1]
-        compliance_matrix = np.zeros((count_ch1, count_ch2, 2),
+        compliance_matrix = np.empty((count_ch1, count_ch2, 2),
                                      dtype=np.int64)
         res = struct_compare(structure1, structure2,
                              compliance_matrix)