
Merge to Mondego/SourcererCC master: fixed a bug that caused the tokenizer to not process files, and added the 'c' and 'h' file extensions to the ini file #3

Open · wants to merge 16 commits into master
6 changes: 6 additions & 0 deletions tokenizers/all-file-level/c.ini
@@ -0,0 +1,6 @@
[Language]
separators = ; :: . -> [ ] ( ) ++ -- ~ ! - + & * .* ->* * / % << >> < > <= >= ++ != & ^ | && || ? == { } = # , " \\ : $
file_extensions = .cpp .hpp .c .h .C .cc .CPP .c++ .cp
comment_inline = //
comment_open_tag = /*
comment_close_tag = */
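Both tokenizer scripts read this file with ConfigParser and split every entry on single spaces, so the separators and extensions are plain space-separated lists. A minimal sketch of that read path, mirroring the parsing code in the scripts below (shown for illustration only):

    import re
    try:
        from configparser import ConfigParser
    except ImportError:
        from ConfigParser import ConfigParser  # Python 2

    config = ConfigParser()
    config.read('c.ini')
    separators = config.get('Language', 'separators').split(' ')
    file_extensions = config.get('Language', 'file_extensions').split(' ')    # ['.cpp', '.hpp', '.c', '.h', ...]
    comment_open_tag = re.escape(config.get('Language', 'comment_open_tag'))  # escaped before use in regexes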
13 changes: 13 additions & 0 deletions tokenizers/all-file-level/config.ini
@@ -0,0 +1,13 @@
[Main]
N_PROCESSES = 4
language_file = c.ini

[Folders/Files]
PATH_proj_paths = project-list.txt
PATH_tokens_folder = tokens
PATH_bookkeeping_file_folder = bookkeeping_files
PATH_bookkeeping_proj_folder = bookkeeping_projs
PATH_projects_success = projects_success.txt
PATH_project_starting_index = project_starting_index.txt
PATH_projects_fail = projects_fail.txt
PATH_mirror_repo = mirror_repo
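The config points the scripts at project-list.txt, which they read one project path per line; paths already recorded in projects_success.txt are skipped on a re-run. A hypothetical project-list.txt (the directory names are made up):

    /data/projects/libfoo
    /data/projects/barutils
    /data/projects/bazlib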
tokenizers/all-file-level/src/tokenizer-directory.py
import logging
from multiprocessing import Process, Value, Lock
import re
import os
import sys
import collections
from lockfile import LockFile

try:
    from configparser import ConfigParser
except ImportError:
    from ConfigParser import ConfigParser  # ver. < 3.0

# Logging code
FORMAT = '[%(levelname)s] (%(threadName)s) %(message)s'
logging.basicConfig(level=logging.DEBUG, format=FORMAT)
file_handler = logging.FileHandler('results.log')
file_handler.setFormatter(logging.Formatter(FORMAT))
logging.getLogger().addHandler(file_handler)

config_file = 'config.ini'

# instantiate
config = ConfigParser()

# parse existing file
try:
    config.read(config_file)
except IOError:
    print 'config settings not found'
    logging.error('Config file ['+config_file+'] not found')
    sys.exit()

# Provided by the user
N_PROCESSES = config.getint('Main', 'N_PROCESSES')
language_file = config.get('Main', 'language_file')

# Folders
PATH_proj_paths = config.get('Folders/Files', 'PATH_proj_paths')
PATH_tokens_folder = config.get('Folders/Files', 'PATH_tokens_folder')
PATH_bookkeeping_file_folder = config.get('Folders/Files', 'PATH_bookkeeping_file_folder')
PATH_bookkeeping_proj_folder = config.get('Folders/Files', 'PATH_bookkeeping_proj_folder')
PATH_projects_success = config.get('Folders/Files', 'PATH_projects_success')
PATH_project_starting_index = config.get('Folders/Files', 'PATH_project_starting_index')
PATH_projects_fail = config.get('Folders/Files', 'PATH_projects_fail')

try:
    config.read(language_file)
except IOError:
    print 'Language settings not found'
    logging.error('Language settings ['+language_file+'] not found')
    sys.exit()

# Read language settings
separators = config.get('Language', 'separators').split(' ')
file_extensions = config.get('Language', 'file_extensions').split(' ')
comment_end_of_line = config.get('Language', 'comment_inline')
comment_open_tag = re.escape(config.get('Language', 'comment_open_tag'))
comment_close_tag = re.escape(config.get('Language', 'comment_close_tag'))


def tokenizer(proj_id, proj_path, file_starting_id, FILE_tokens, FILE_bookkeeping_file, FILE_bookkeeping_proj):
    if not os.path.exists(proj_path):
        logging.error('Project not found <'+proj_id+','+proj_path+'>')
        # Important to have a global lock on this file because it is shared
        lock = LockFile(PATH_projects_fail)
        with lock:
            with open(PATH_projects_fail, 'a+') as project_fail:
                project_fail.write(proj_path+'\n')
        return

    # In case process names need to be logged
    # process_name = '['+mp.current_process().name+'] '
    # logging.info(process_name+'Starting proj <'+proj_id+','+proj_path+'>')

    all_files = []
    for (dirpath, dirnames, filenames) in os.walk(proj_path):
        aux_list = []
        for extension in file_extensions:
            aux = [x for x in filenames if x.endswith(extension)]
            #aux = [os.path.join(dirpath,x) for x in aux]
            aux_list.extend(aux)
        all_files.extend(aux_list)

    # Increment the shared file_starting_id by the amount of files in the current project
    lock = Lock()
    with lock:
        all_files = zip(range(file_starting_id.value, file_starting_id.value+len(all_files)), all_files)
        file_starting_id.value += len(all_files)

    for file_id, file_path in all_files:
        file_path = proj_path+'/'+file_path
        logging.info('Starting file <'+proj_id+','+str(file_id)+','+file_path+'>')
        try:
            with open(file_path, 'r') as myfile:
                file_string = myfile.read()
        except IOError:
            logging.error('File not found <'+proj_id+','+str(file_id)+','+file_path+'>')
            continue

        # Remove end-of-line comments
        file_string = re.sub(comment_end_of_line+'.*?\n', '', file_string, flags=re.DOTALL)
        # Remove tagged comments
        file_string = re.sub(comment_open_tag+'.*?'+comment_close_tag, '', file_string, flags=re.DOTALL)
        # Transform separators into spaces (remove them)
        for x in separators:
            file_string = file_string.replace(x, ' ')
        # Create a list of tokens
        file_string = file_string.split()
        # Total number of tokens
        tokens_count_total = len(file_string)
        # Count occurrences
        file_string = collections.Counter(file_string)
        # Converting Counter to dict because, according to StackOverflow, it is better
        file_string = dict(file_string)
        # Unique number of tokens
        tokens_count_unique = len(file_string)

        tokens = []
        # SourcererCC formatting
        for k, v in file_string.items():
            tokens.append(k+'@@::@@'+str(v))
        tokens = ','.join(tokens)

        FILE_bookkeeping_file.write(proj_id+','+str(file_id)+','+file_path+'\n')
        FILE_tokens.write(proj_id+','+str(file_id)+','+str(tokens_count_total)+','+str(tokens_count_unique)+'@#@'+tokens+'\n')

    FILE_bookkeeping_proj.write(proj_id+','+proj_path+'\n')

    # Important to have a global lock on this file because it is shared
    lock = LockFile(PATH_projects_success)
    with lock:
        with open(PATH_projects_success, 'a+') as project_success:
            project_success.write(proj_path+'\n')


def tokenize(list_projects, FILE_tokens_name, FILE_bookkeeping_file_name, FILE_bookkeeping_proj_name, file_starting_id):
    # Each tokenize will represent a new process
    with open(FILE_tokens_name, 'w') as FILE_tokens, \
         open(FILE_bookkeeping_file_name, 'w') as FILE_bookkeeping_file, \
         open(FILE_bookkeeping_proj_name, 'w') as FILE_bookkeeping_proj:
        for proj_id, proj_path in list_projects:
            tokenizer(str(proj_id), proj_path, file_starting_id, FILE_tokens, FILE_bookkeeping_file, FILE_bookkeeping_proj)


if __name__ == '__main__':
    # In the main file we:
    #   create directories if they do not exist
    #   read list of PATH_projects_success, if it exists, and do not process these again
    #   each process needs a unique file with tokens and file and project
    #     bookkeeping in the proper folders
    #   start N_PROCESSES, and give them [(unique_id, proj_path)]

    if not os.path.exists(PATH_tokens_folder):
        os.makedirs(PATH_tokens_folder)
    if not os.path.exists(PATH_bookkeeping_file_folder):
        os.makedirs(PATH_bookkeeping_file_folder)
    if not os.path.exists(PATH_bookkeeping_proj_folder):
        os.makedirs(PATH_bookkeeping_proj_folder)

    proj_paths = []
    with open(PATH_proj_paths) as f:
        for line in f:
            proj_paths.append(line.strip('\n'))

    projects_success = []
    try:
        with open(PATH_projects_success, 'r') as f:
            for line in f:
                projects_success.append(line.strip().strip('\n'))
    except IOError as e:
        logging.info('File '+PATH_projects_success+' not found')

    projects_starting_index = 0
    proj_paths = list(set(proj_paths) - set(projects_success))

    # Initialize projects_starting_index with previous logged number
    if not os.path.exists(PATH_project_starting_index):
        with open(PATH_project_starting_index, 'w') as FILE_project_starting_index:
            FILE_project_starting_index.write(str(len(proj_paths))+'\n')
    else:
        try:
            with open(PATH_project_starting_index, 'r') as FILE_project_starting_index:
                projects_starting_index = int(FILE_project_starting_index.readline().strip('\n'))
        except ValueError:
            projects_starting_index = 0

        with open(PATH_project_starting_index, 'w') as FILE_project_starting_index:
            FILE_project_starting_index.write(str(projects_starting_index+len(proj_paths))+'\n')

    proj_paths = zip(range(projects_starting_index, len(proj_paths)+projects_starting_index), proj_paths)

    # Split list of projects into N_PROCESSES lists
    proj_paths_list = [proj_paths[i::N_PROCESSES] for i in xrange(N_PROCESSES)]

    # Multiprocessing with N_PROCESSES
    processes = []
    # Multiprocessing shared variable instance for recording file_id
    file_starting_id = Value('i', 0)
    process_num = 0
    n = 0
    for input_process in proj_paths_list:

        # Skip empty sublists
        if len(input_process) == 0:
            continue

        process_num += 1
        FILE_tokens_name = PATH_tokens_folder+'/'+'tokens_'+str(n)+'.txt'
        FILE_bookkeeping_file_name = PATH_bookkeeping_file_folder+'/'+'bookkeeping_file_'+str(n)+'.txt'
        FILE_bookkeeping_proj_name = PATH_bookkeeping_proj_folder+'/'+'bookkeeping_proj_'+str(n)+'.txt'

        while (os.path.isfile(FILE_tokens_name) and os.path.isfile(FILE_bookkeeping_file_name) and os.path.isfile(FILE_bookkeeping_proj_name)):
            n += 1
            FILE_tokens_name = PATH_tokens_folder+'/'+'tokens_'+str(n)+'.txt'
            FILE_bookkeeping_file_name = PATH_bookkeeping_file_folder+'/'+'bookkeeping_file_'+str(n)+'.txt'
            FILE_bookkeeping_proj_name = PATH_bookkeeping_proj_folder+'/'+'bookkeeping_proj_'+str(n)+'.txt'

        n += 1
        processes.append(Process(name='Process '+str(process_num), target=tokenize, args=(input_process, FILE_tokens_name, FILE_bookkeeping_file_name, FILE_bookkeeping_proj_name, file_starting_id, )))

    for proc in processes:
        proc.start()
        logging.info(proc.name)

    for proc in processes:
        proc.join()
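tokenizer-directory.py keeps file ids unique across worker processes by sharing one multiprocessing.Value counter: before tokenizing a project, a worker reserves a contiguous block of ids for that project's files. A minimal sketch of the pattern (illustration only, not part of this change; the sketch guards the update with Value.get_lock(), and the helper name reserve_ids is made up):

    from multiprocessing import Value

    file_starting_id = Value('i', 0)   # shared by all worker processes

    def reserve_ids(n_files):
        # Reserve n_files consecutive ids with a read-modify-write under the Value's own lock
        with file_starting_id.get_lock():
            start = file_starting_id.value
            file_starting_id.value += n_files
        return range(start, start + n_files)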
283 changes: 283 additions & 0 deletions tokenizers/all-file-level/src/tokenizer-tar.py
@@ -0,0 +1,283 @@
import logging
import multiprocessing as mp
from multiprocessing import Process
import re
import os
import collections
from lockfile import LockFile
import tarfile
import mimetypes
import sys
import hashlib

try:
from configparser import ConfigParser
except ImportError:
from ConfigParser import ConfigParser # ver. < 3.0

# Logging code
FORMAT = '[%(levelname)s] (%(threadName)s) %(message)s'
logging.basicConfig(level=logging.DEBUG,format=FORMAT)
file_handler = logging.FileHandler('results.log')
file_handler.setFormatter(logging.Formatter(FORMAT))
logging.getLogger().addHandler(file_handler)

config_file = 'config.ini'

# instantiate
config = ConfigParser()

# parse existing file
try:
config.read(config_file)
except IOError:
    print 'config settings not found'
logging.error('Config file ['+config_file+'] not found')
sys.exit()

N_PROCESSES = config.getint('Main', 'N_PROCESSES')
language_file = config.get('Main', 'language_file')

# Folders
PATH_proj_paths = config.get('Folders/Files', 'PATH_proj_paths')
PATH_tokens_folder = config.get('Folders/Files', 'PATH_tokens_folder')
PATH_bookkeeping_file_folder = config.get('Folders/Files', 'PATH_bookkeeping_file_folder')
PATH_bookkeeping_proj_folder = config.get('Folders/Files', 'PATH_bookkeeping_proj_folder')
PATH_projects_success = config.get('Folders/Files', 'PATH_projects_success')
PATH_project_starting_index = config.get('Folders/Files', 'PATH_project_starting_index')
PATH_projects_fail = config.get('Folders/Files', 'PATH_projects_fail')

try:
config.read(language_file)
except IOError:
    print 'Language settings not found'
logging.error('Language settings ['+language_file+'] not found')
sys.exit()

# Read language settings
separators = config.get('Language', 'separators').split(' ')
file_extensions = config.get('Language', 'file_extensions').split(' ')
comment_end_of_line = config.get('Language', 'comment_inline')
comment_open_tag = re.escape(config.get('Language', 'comment_open_tag'))
comment_close_tag = re.escape(config.get('Language', 'comment_close_tag'))


ALWAYS = ['@','@#@','@@::@@','#'] # These should always be part of the separators
separators.extend(ALWAYS)

# Some of the files we found happen to be binary, even though their extension is something
# like *.cpp. Therefore we reuse the heuristic behind file(1) to detect whether a file is binary
# http://stackoverflow.com/questions/32184809/python-file1-why-are-the-numbers-7-8-9-10-12-13-27-and-range0x20-0x100
textchars = bytearray({7,8,9,10,12,13,27} | set(range(0x20, 0x100)) - {0x7f})
is_binary_string = lambda bytes: bool(bytes.translate(None, textchars))
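# Illustration (not part of this change): the heuristic deletes every "texty" byte and
# flags the input as binary if anything is left over, e.g. a NUL byte:
#   is_binary_string('int main() { return 0; }')  ->  False  (plain source text)
#   is_binary_string('GIF89a\x00\x01\x00\x01')    ->  True   (contains NUL bytes)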

def tokenizer(proj_id, proj_path, FILE_tokens_name, FILE_bookkeeping_file_name, FILE_bookkeeping_proj_name):
logging.info('Starting project <'+proj_id+','+proj_path+'>')

if not os.path.isdir(proj_path):
logging.error('Unable to open project <'+proj_id+','+proj_path+'>')
lock = LockFile(PATH_projects_fail)
with lock:
with open(PATH_projects_fail,'a+') as project_failure:
project_failure.write(proj_path+'\n')
return

# Search for all tar files
tar_files = [os.path.join(proj_path, f) for f in os.listdir(proj_path) if os.path.isfile(os.path.join(proj_path, f))]
tar_files = [f for f in tar_files if '_code' in f]
if(len(tar_files) != 1):
logging.error('Tar not found on <'+proj_id+','+proj_path+'>')
        # Important to have a global lock on this file because it is shared
lock = LockFile(PATH_projects_fail)
with lock:
with open(PATH_projects_fail,'a+') as project_fail:
project_fail.write(proj_path+'\n')
return

tar_file = tar_files[0]

try:
with tarfile.open(tar_file,'r') as my_tar_file:
# Get all members on the tar file
all_files = []
for member in my_tar_file.getmembers():
all_files.append(member.name)

# Filter them by the correct extension
aux = []
for extension in file_extensions:
aux.extend([x for x in all_files if x.endswith(extension)])
all_files = aux

            # This is very strange, but I did find some paths with newlines,
            # so I am simply eliminating those
all_files = [x for x in all_files if '\n' not in x]

# In case process names need to be logged
# process_name = '['+mp.current_process().name+'] '

all_files = zip(range(0,len(all_files)),all_files)

for file_id, file_path in all_files:

logging.info('Starting file <'+proj_id+','+str(file_id)+','+os.path.join(tar_file,file_path)+'>')

try:
myfile = my_tar_file.extractfile(file_path)
except:
logging.error('Unable to open file (1) <'+proj_id+','+str(file_id)+','+os.path.join(tar_file,file_path)+'>')
break

if myfile is None:
logging.error('Unable to open file (2) <'+proj_id+','+str(file_id)+','+os.path.join(tar_file,file_path)+'>')
break

file_string = myfile.read()

if is_binary_string(file_string):
logging.error('Unable to open file (3) <'+proj_id+','+str(file_id)+','+os.path.join(tar_file,file_path)+'>')
break

                # Remove end-of-line comments
file_string = re.sub(comment_end_of_line+'.*?\n','',file_string,flags=re.DOTALL)
# Remove tagged comments
file_string = re.sub(comment_open_tag+'.*?'+comment_close_tag,'',file_string,flags=re.DOTALL)
#Transform separators into spaces (remove them)
for x in separators:
file_string = file_string.replace(x,' ')
#Create a list of tokens
file_string = file_string.split()
# Total number of tokens
tokens_count_total = len(file_string)
#Count occurrences
file_string = collections.Counter(file_string)
                # Converting Counter to dict because, according to StackOverflow, it is better
                file_string = dict(file_string)
# Unique number of tokens
tokens_count_unique = len(file_string)

tokens = []
#SourcererCC formatting
for k, v in file_string.items():
tokens.append(k+'@@::@@'+str(v))
tokens = ','.join(tokens)

# MD5
m = hashlib.md5()
m.update(tokens)

with open(FILE_tokens_name, 'a+') as FILE_tokens_file:
FILE_tokens_file.write(proj_id+','+str(file_id)+','+str(tokens_count_total)+','+str(tokens_count_unique)\
+','+m.hexdigest()\
+'@#@'+tokens+'\n')
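                # Illustration (not part of this change): each line written above has the shape
                #   proj_id,file_id,total_tokens,unique_tokens,md5@#@token1@@::@@count1,token2@@::@@count2,...
                # e.g. a hypothetical record: 17,4,120,36,9e107d9d372bb6826bd81d3542a419d6@#@int@@::@@12,return@@::@@3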

with open(FILE_bookkeeping_file_name, 'a+') as FILE_bookkeeping_file:
FILE_bookkeeping_file.write(proj_id+','+str(file_id)+','+os.path.join(tar_file,file_path)+'\n')

except Exception:
logging.error('Unable to open tar on <'+proj_id+','+proj_path+'>')
lock = LockFile(PATH_projects_fail)
with lock:
with open(PATH_projects_fail,'a+') as project_failure:
project_failure.write(proj_path+'\n')
return

with open(FILE_bookkeeping_proj_name, 'a+') as FILE_bookkeeping_proj:
FILE_bookkeeping_proj.write(proj_id+','+proj_path+'\n')

    # Important to have a global lock on this file because it is shared
lock = LockFile(PATH_projects_success)
with lock:
with open(PATH_projects_success,'a+') as project_success:
project_success.write(proj_path+'\n')

logging.info('Project finished <'+proj_id+','+proj_path+'>')


def tokenize(list_projects, FILE_tokens_name, FILE_bookkeeping_file_name, FILE_bookkeeping_proj_name):

# Each tokenize will represent a new process
for proj_id, proj_path in list_projects:
tokenizer(str(proj_id), proj_path, FILE_tokens_name, FILE_bookkeeping_file_name, FILE_bookkeeping_proj_name)


if __name__ == '__main__':
#In the main file we:
# create directories if they do not exist
    #  read list of PATH_projects_success, if it exists, and do not process these again
# each process needs a unique file with tokens and file and project
# bookkeeping in the proper folders
# start N_PROCESSES, and give them [(unique_id, proj_path)]

if not os.path.exists(PATH_tokens_folder):
os.makedirs(PATH_tokens_folder)
if not os.path.exists(PATH_bookkeeping_file_folder):
os.makedirs(PATH_bookkeeping_file_folder)
if not os.path.exists(PATH_bookkeeping_proj_folder):
os.makedirs(PATH_bookkeeping_proj_folder)

proj_paths = []
with open(PATH_proj_paths) as f:
for line in f:
proj_paths.append(line.strip('\n'))

projects_success = []
try:
with open(PATH_projects_success,'r') as f:
for line in f:
projects_success.append(line.strip().strip('\n'))
except IOError as e:
        logging.info('File '+PATH_projects_success+' not found')

projects_starting_index = 0
proj_paths = list(set(proj_paths) - set(projects_success))

# Initialize projects_starting_index with previous logged number
if not os.path.exists(PATH_project_starting_index):
with open(PATH_project_starting_index, 'w') as FILE_project_starting_index:
FILE_project_starting_index.write(str(len(proj_paths))+'\n')
else:
try:
with open(PATH_project_starting_index, 'r') as FILE_project_starting_index:
projects_starting_index = int(FILE_project_starting_index.readline().strip('\n'))
except ValueError:
projects_starting_index = 0

with open(PATH_project_starting_index, 'w') as FILE_project_starting_index:
FILE_project_starting_index.write(str(projects_starting_index+len(proj_paths))+'\n')

proj_paths = zip(range(projects_starting_index, len(proj_paths)+projects_starting_index),proj_paths)

#Split list of projects into N_PROCESSES lists
proj_paths_list = [ proj_paths[i::N_PROCESSES] for i in xrange(N_PROCESSES) ]

# Multiprocessing with N_PROCESSES
processes = []
process_num = 0
    n = 0
for input_process in proj_paths_list:

# Skip empty sublists
if len(input_process) == 0:
continue

process_num += 1
FILE_tokens_name = PATH_tokens_folder+'/'+'tokens_'+str(n)+'.txt'
FILE_bookkeeping_file_name = PATH_bookkeeping_file_folder+'/'+'bookkeeping_file_'+str(n)+'.txt'
FILE_bookkeeping_proj_name = PATH_bookkeeping_proj_folder+'/'+'bookkeeping_proj_'+str(n)+'.txt'

while (os.path.isfile(FILE_tokens_name) and os.path.isfile(FILE_bookkeeping_file_name) and os.path.isfile(FILE_bookkeeping_proj_name)):
n += 1
FILE_tokens_name = PATH_tokens_folder+'/'+'tokens_'+str(n)+'.txt'
FILE_bookkeeping_file_name = PATH_bookkeeping_file_folder+'/'+'bookkeeping_file_'+str(n)+'.txt'
FILE_bookkeeping_proj_name = PATH_bookkeeping_proj_folder+'/'+'bookkeeping_proj_'+str(n)+'.txt'

n += 1
processes.append(Process(name='Process '+str(process_num), target=tokenize, args=(input_process, FILE_tokens_name, FILE_bookkeeping_file_name, FILE_bookkeeping_proj_name,)))

for proc in processes:
proc.start()
logging.info(proc.name)

for proc in processes:
proc.join()