Skip to content

Commit

Permalink
add text extraction from python code comments
Browse files Browse the repository at this point in the history
  • Loading branch information
Илья Лебедев committed Jul 24, 2019
1 parent 526b245 commit bc986de
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 9 deletions.
2 changes: 1 addition & 1 deletion rozental_as_a_service/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.0.4'
__version__ = '0.0.5'
7 changes: 6 additions & 1 deletion rozental_as_a_service/files_utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
from typing import List
from pathlib import Path

Expand All @@ -8,4 +9,8 @@ def is_path_in_exclude_list(path: str, exclude: List[str]) -> bool:

def get_all_filepathes_recursively(path: str, exclude: List[str], extension: str) -> List[str]:
    """List files under *path* that end with ``.<extension>``.

    The tree is searched recursively with a ``**/*.<extension>`` glob.
    A match is dropped when ``is_path_in_exclude_list`` accepts it for
    *exclude*, or when it is a directory whose name happens to match
    the pattern.  Results are returned as plain strings.
    """
    matched = []
    for candidate in Path(path).glob(f'**/*.{extension}'):
        candidate_str = str(candidate)
        if is_path_in_exclude_list(candidate_str, exclude):
            continue
        # A directory can be named like "something.py"; only keep files.
        if os.path.isdir(candidate_str):
            continue
        matched.append(candidate_str)
    return matched
31 changes: 24 additions & 7 deletions rozental_as_a_service/strings_extractors.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,22 @@
import ast
import io
import re

from typing import List
import tokenize

from bs4 import BeautifulSoup
from markdown import markdown
from esprima import tokenize, Error
from esprima import tokenize as esprima_tokenize, Error

from rozental_as_a_service.ast_utils import extract_all_constants_from_ast


def extract_from_python_src(raw_content: str) -> List[str]:
    """Gather the unique strings found in Python source text.

    Combines what ``_extract_from_python_ast`` and
    ``_extract_from_python_code_comments`` return for *raw_content* and
    removes duplicates.  The order of the resulting list is unspecified
    because it passes through a ``set``.
    """
    from_ast = _extract_from_python_ast(raw_content)
    from_comments = _extract_from_python_code_comments(raw_content)
    return list(set(from_ast + from_comments))


def extract_from_html(raw_content: str) -> List[str]:
Expand All @@ -31,7 +32,23 @@ def extract_from_markdown(raw_content: str) -> List[str]:

def extract_from_js(raw_content: str) -> List[str]:
    """Return the distinct values of ``String`` tokens in JavaScript source.

    The source is tokenized with esprima; if esprima raises ``Error``
    an empty list is returned.  The order of the result is unspecified.
    """
    try:
        js_tokens = esprima_tokenize(raw_content)
    except Error:
        return []
    unique_values = set()
    for token in js_tokens:
        if token.type == 'String':
            unique_values.add(token.value)
    return list(unique_values)


def _extract_from_python_ast(raw_content: str) -> List[str]:
try:
ast_tree = ast.parse(raw_content)
except SyntaxError:
return []
return extract_all_constants_from_ast(ast_tree)


def _extract_from_python_code_comments(raw_content: str) -> List[str]:
string_constants = []
for line in tokenize.generate_tokens(io.StringIO(raw_content).readline):
if line.type == tokenize.COMMENT:
string_constants.append(line.string)
return string_constants

0 comments on commit bc986de

Please sign in to comment.