Skip to content

Commit

Permalink
refactor: format code with black and make the view of the Python report table more intuitive
Browse files Browse the repository at this point in the history

- Format with black;
- Handle the AnnAssign, AugAssign, and Assign classes and convert them to descriptive names for a more readable report table;
- Add line numbers (lineno) to the report table for Python works;
- Add missing unit tests.

Refs: #175
  • Loading branch information
Artanias authored Sep 24, 2023
1 parent 612b5ab commit 6a81f73
Show file tree
Hide file tree
Showing 43 changed files with 1,713 additions and 1,896 deletions.
4 changes: 4 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
default_language_version:
python: python3.8
repos:
- repo: https://github.com/psf/black
rev: 23.9.1
hooks:
- id: black
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.0.287
hooks:
Expand Down
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
UTIL_VERSION := 0.3.6
UTIL_VERSION := 0.3.7
UTIL_NAME := codeplag
PWD := $(shell pwd)

Expand All @@ -10,6 +10,7 @@ BASE_DOCKER_TAG := $(shell echo $(UTIL_NAME)-base-ubuntu20.04:$(BASE_DOC
TEST_DOCKER_TAG := $(shell echo $(UTIL_NAME)-test-ubuntu20.04:$(UTIL_VERSION) | tr A-Z a-z)
DOCKER_TAG ?= $(shell echo $(UTIL_NAME)-ubuntu20.04:$(UTIL_VERSION) | tr A-Z a-z)

PYTHONDONTWRITEBYTECODE := "1"
PYTHONPATH := $(PWD)/src/:$(PWD)/test/auto

LOGS_PATH := /var/log/$(UTIL_NAME)
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@
$ register-python-argcomplete codeplag >> ~/.bashrc
```

## 5. Demo examples (works in the project directory and with an installed codeplag package)
## 4. Demo examples (works in the project directory and with an installed codeplag package)

- Python analyzer
```
Expand Down
122 changes: 54 additions & 68 deletions docs/notebooks/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,14 @@ def remove_unnecessary_blank_lines(source_code: str) -> str:


def get_data_from_dir(
path: str = './data',
max_count_lines: Optional[int] = None
path: str = "./data", max_count_lines: Optional[int] = None
) -> pd.DataFrame:
df = pd.DataFrame()
for filename in os.listdir(path):
if not re.search(r'.csv$', filename):
if not re.search(r".csv$", filename):
continue

tmp_df = pd.read_csv(os.path.join(path, filename), sep=';', index_col=0)
tmp_df = pd.read_csv(os.path.join(path, filename), sep=";", index_col=0)
df = df.append(tmp_df, ignore_index=True)

if max_count_lines:
Expand All @@ -52,43 +51,43 @@ def get_data_from_dir(


def save_works_from_repo_url(url: str, check_policy: bool = True) -> None:
current_repo_name = url.split('/')[-1]
env_config = Config(RepositoryEnv('../../.env'))
current_repo_name = url.split("/")[-1]
env_config = Config(RepositoryEnv("../../.env"))
gh = GitHubParser(
file_extensions=(re.compile(r'.py$'),),
file_extensions=(re.compile(r".py$"),),
check_all=check_policy,
access_token=env_config.get('ACCESS_TOKEN')
access_token=env_config.get("ACCESS_TOKEN"),
)
files = list(gh.get_files_generator_from_repo_url(url))
files = [(remove_unnecessary_blank_lines(file.code), file.link) for file in files]

df = pd.DataFrame(
{
'content': [file_[0] for file_ in files[:-1]],
'link': [file_[1] for file_ in files[:-1]],
'extension': ['py'] * (len(files) - 1),
'repo_name': [current_repo_name] * (len(files) - 1),
'content_len': [len(file_[0]) for file_ in files[:-1]],
'content_len_without_blank': [
len(file_[0].replace(' ', '').replace('\n', '').replace('\t', ''))
"content": [file_[0] for file_ in files[:-1]],
"link": [file_[1] for file_ in files[:-1]],
"extension": ["py"] * (len(files) - 1),
"repo_name": [current_repo_name] * (len(files) - 1),
"content_len": [len(file_[0]) for file_ in files[:-1]],
"content_len_without_blank": [
len(file_[0].replace(" ", "").replace("\n", "").replace("\t", ""))
for file_ in files[:-1]
],
'count_lines_without_blank_lines': [
"count_lines_without_blank_lines": [
len(file_[0].splitlines()) for file_ in files[:-1]
]
],
}
)
df = df[df['count_lines_without_blank_lines'] > 5]
df.to_csv(os.path.join('./data/', current_repo_name + '.csv'), sep=';')
df = df[df["count_lines_without_blank_lines"] > 5]
df.to_csv(os.path.join("./data/", current_repo_name + ".csv"), sep=";")


def get_time_to_meta(df: pd.DataFrame, iterations: int = 10) -> pd.DataFrame:
count_lines = []
to_meta_time = []
for (index, content) in df[
['content', 'link', 'count_lines_without_blank_lines']
for index, content in df[
["content", "link", "count_lines_without_blank_lines"]
].iterrows():
print(index, " " * 20, end='\r')
print(index, " " * 20, end="\r")
for _ in range(iterations):
tree = get_ast_from_content(content[0], content[1])
if tree is None:
Expand All @@ -102,12 +101,7 @@ def get_time_to_meta(df: pd.DataFrame, iterations: int = 10) -> pd.DataFrame:
except Exception:
break

output = pd.DataFrame(
{
'count_lines': count_lines,
'times': to_meta_time
}
)
output = pd.DataFrame({"count_lines": count_lines, "times": to_meta_time})

return output

Expand All @@ -118,7 +112,7 @@ def plot_and_save_result(
ylabel: str,
title: str,
what: str,
trend: Literal['linear', 'n^2', 'n^3', 'n^4'] = 'linear'
trend: Literal["linear", "n^2", "n^3", "n^4"] = "linear",
) -> None:
# Simple Moving average
unique_count_lines = np.unique(df.count_lines)
Expand All @@ -135,75 +129,72 @@ def plot_and_save_result(
plt.figure(figsize=(12, 12), dpi=80)
# plt.plot(unique_count_lines, mean_times, label='Среднее')

if trend == 'linear':
if trend == "linear":
z = np.polyfit(unique_count_lines, mean_times, 1)
p = np.poly1d(z)
plt.plot(
unique_count_lines, p(unique_count_lines), "r--", label='Линейный тренд.'
unique_count_lines, p(unique_count_lines), "r--", label="Линейный тренд."
)
elif trend == 'n^2':
elif trend == "n^2":
popt_cons, _ = curve_fit(
square_func,
unique_count_lines,
mean_times,
bounds=([-np.inf, 0., 0.], [np.inf, 0.1 ** 100, 0.1 ** 100])
bounds=([-np.inf, 0.0, 0.0], [np.inf, 0.1**100, 0.1**100]),
)
p = np.poly1d(popt_cons)
plt.plot(
unique_count_lines,
p(unique_count_lines),
"r--", label='Квадратичный тренд.'
"r--",
label="Квадратичный тренд.",
)
elif trend == 'n^3':
elif trend == "n^3":
popt_cons, _ = curve_fit(
cube_func,
unique_count_lines,
mean_times,
bounds=([-np.inf, 0., 0., 0.], [np.inf, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100])
bounds=(
[-np.inf, 0.0, 0.0, 0.0],
[np.inf, 0.1**100, 0.1**100, 0.1**100],
),
)
p = np.poly1d(popt_cons)
plt.plot(
unique_count_lines,
p(unique_count_lines),
"r--",
label='Кубический тренд.'
unique_count_lines, p(unique_count_lines), "r--", label="Кубический тренд."
)
elif trend == 'n^4':
elif trend == "n^4":
popt_cons, _ = curve_fit(
quart_func,
unique_count_lines,
mean_times,
bounds=(
[-np.inf, 0., 0., 0., 0.],
[np.inf, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100]
)
[-np.inf, 0.0, 0.0, 0.0, 0.0],
[np.inf, 0.1**100, 0.1**100, 0.1**100, 0.1**100],
),
)
p = np.poly1d(popt_cons)
plt.plot(unique_count_lines, p(unique_count_lines), "r--", label='n^4.')
plt.plot(unique_count_lines, p(unique_count_lines), "r--", label="n^4.")
else:
raise Exception(f"Incorrect tred '{trend}'.")

rolling = pd.DataFrame(
{
'unique_count_lines': unique_count_lines,
'mean_times': mean_times
}
{"unique_count_lines": unique_count_lines, "mean_times": mean_times}
)
num_window = 20
plt.plot(
rolling.unique_count_lines,
rolling.mean_times.rolling(window=num_window).mean(),
label=f'Скользящее среднее по {num_window}ти замерам.'
label=f"Скользящее среднее по {num_window}ти замерам.",
)

plt.ylabel(ylabel, fontsize=15)
plt.xlabel(xlabel, fontsize=15)
plt.title(title, fontsize=17)
plt.legend(loc='upper left')
plt.legend(loc="upper left")
plt.savefig(
'./graphics/need_time_{}_{}.png'.format(
what,
datetime.now().strftime("%d%m%Y_%H%M%S")
"./graphics/need_time_{}_{}.png".format(
what, datetime.now().strftime("%d%m%Y_%H%M%S")
)
)

Expand All @@ -212,7 +203,7 @@ def get_time_algorithms(
df: pd.DataFrame,
work,
iterations: int = 5,
metric: Literal['fast', 'gst', 'structure'] = 'fast'
metric: Literal["fast", "gst", "structure"] = "fast",
) -> pd.DataFrame:
count_lines = []
times = []
Expand All @@ -221,11 +212,11 @@ def get_time_algorithms(
raise Exception("Unexpected error when parsing first work.")

features1 = get_features_from_ast(tree1, work.link)
for (index, content) in df[
['content', 'link', 'count_lines_without_blank_lines']
for index, content in df[
["content", "link", "count_lines_without_blank_lines"]
].iterrows():
for _ in range(iterations):
print(index, " " * 20, end='\r')
print(index, " " * 20, end="\r")
tree2 = get_ast_from_content(content[0], content[1])
if tree2 is None:
continue
Expand All @@ -234,34 +225,29 @@ def get_time_algorithms(
except Exception:
continue

if metric == 'fast':
if metric == "fast":
start = perf_counter()
value_jakkar_coef(features1.tokens, features2.tokens)
counter_metric(features1.operators, features2.operators)
counter_metric(features1.keywords, features2.keywords)
counter_metric(features1.literals, features2.literals)
end = perf_counter() - start
times.append(end)
elif metric == 'gst':
elif metric == "gst":
start = perf_counter()
gst(features1.tokens, features2.tokens, 6)
end = perf_counter() - start
times.append(end)
elif metric == 'structure':
elif metric == "structure":
start = perf_counter()
struct_compare(features1.structure, features2.structure)
end = perf_counter() - start
times.append(end)
else:
raise Exception('Incorrect metric!')
raise Exception("Incorrect metric!")

count_lines.append(content[2])

output = pd.DataFrame(
{
'count_lines': count_lines,
'times': times
}
)
output = pd.DataFrame({"count_lines": count_lines, "times": times})

return output
70 changes: 35 additions & 35 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,58 +5,58 @@
from setuptools import find_packages, setup

INSTALL_REQUIREMENTS = [
'argcomplete~=2.0.0',
'numpy~=1.23.5',
'pandas~=1.4.3',
'ccsyspath~=1.1.0',
'clang~=16.0.1.1',
'llvmlite~=0.40.1',
'libclang~=16.0.0',
'python-decouple~=3.6',
'requests~=2.31.0',
'typing-extensions~=4.3.0',
'aiohttp~=3.8.5',
'cachetools==5.3.1',
'gidgethub~=5.3.0',
"argcomplete~=2.0.0",
"numpy~=1.23.5",
"pandas~=1.4.3",
"ccsyspath~=1.1.0",
"clang~=16.0.1.1",
"llvmlite~=0.40.1",
"libclang~=16.0.0",
"python-decouple~=3.6",
"requests~=2.31.0",
"typing-extensions~=4.3.0",
"aiohttp~=3.8.5",
"cachetools==5.3.1",
"gidgethub~=5.3.0",
]
UTIL_NAME = os.getenv('UTIL_NAME')
UTIL_VERSION = os.getenv('UTIL_VERSION')
UTIL_NAME = os.getenv("UTIL_NAME")
UTIL_VERSION = os.getenv("UTIL_VERSION")


if '--install-requirements' in sys.argv:
print(' '.join(INSTALL_REQUIREMENTS))
if "--install-requirements" in sys.argv:
print(" ".join(INSTALL_REQUIREMENTS))
sys.exit(0)
elif UTIL_NAME is None or UTIL_VERSION is None:
print('Please provide UTIL_NAME and UTIL_VERSION environment variables.')
print("Please provide UTIL_NAME and UTIL_VERSION environment variables.")
sys.exit(1)


setup(
name=f'{UTIL_NAME}',
version=f'{UTIL_VERSION}',
description='Code plagiarism searching package',
author='Artyom Semidolin, Dmitry Nikolaev, Alexander Evsikov',
url='https://github.com/OSLL/code-plagiarism',
name=f"{UTIL_NAME}",
version=f"{UTIL_VERSION}",
description="Code plagiarism searching package",
author="Artyom Semidolin, Dmitry Nikolaev, Alexander Evsikov",
url="https://github.com/OSLL/code-plagiarism",
long_description=Path("README.md").read_text(encoding="utf-8"),
long_description_content_type="text/markdown",
license='MIT License',
license="MIT License",
platforms=["linux"],
classifiers=[
'Development Status :: 4 - Beta',
'Environment :: Console',
'Intended Audience :: Education',
'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.8',
'Topic :: Software Development :: Plagiarism Detection',
"Development Status :: 4 - Beta",
"Environment :: Console",
"Intended Audience :: Education",
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.8",
"Topic :: Software Development :: Plagiarism Detection",
],
package_dir={"": "src"},
packages=find_packages("src"),
python_requires='>=3.8',
python_requires=">=3.8",
install_requires=INSTALL_REQUIREMENTS,
entry_points={
'console_scripts': [
'codeplag = codeplag:main',
"console_scripts": [
"codeplag = codeplag:main",
]
}
},
)
Loading

0 comments on commit 6a81f73

Please sign in to comment.