Skip to content

Commit

Permalink
add module for translating src docs
Browse files Browse the repository at this point in the history
  • Loading branch information
dvzubarev committed May 5, 2020
1 parent d941580 commit 5af463b
Show file tree
Hide file tree
Showing 12 changed files with 581 additions and 56 deletions.
15 changes: 15 additions & 0 deletions bin/translate_src_docs
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/usr/bin/env python
# coding: utf-8
"""Entry-point wrapper: make the repository root importable, then run the
translate_src_docs CLI."""

import logging
import sys
import os.path as fs

# Prepend the repo root (parent of this bin/ directory) to sys.path so the
# plag_submissions_utils package resolves when run straight from a checkout.
sys.path.insert(0,
                fs.dirname(fs.dirname(fs.realpath(__file__))))

# NOTE(review): `logging` is imported but unused here — possibly kept for
# side-effect-free parity with sibling bin/ scripts; confirm before removing.
import plag_submissions_utils.translate_src_docs as cli


if __name__ == '__main__':
    cli.main()
6 changes: 6 additions & 0 deletions plag_submissions_utils/common/chunks.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ def __init__(self, orig_text, mod_text,

self._orig_doc = orig_doc

def get_id(self):
    # Alias for get_chunk_id() so chunks expose the same id accessor as
    # other document-like objects in this package.
    return self.get_chunk_id()

def get_chunk_id(self):
    # Numeric identifier of this chunk, as stored at construction time.
    return self._chunk_num
Expand All @@ -120,9 +122,13 @@ def get_mod_type(self):
return self._mod_types[0]
return ModType.UNK

def set_mod_types(self, mod_types):
    # Replace the whole list of modification types. The list is stored
    # as-is (not copied), so callers must not mutate it afterwards.
    self._mod_types = mod_types

def get_all_mod_types(self):
    # All modification types recorded for this chunk (the internal list,
    # not a copy).
    return self._mod_types


def has_mod_type(self, mod_type):
    """Return True when mod_type is among this chunk's modification types."""
    return any(known == mod_type for known in self._mod_types)

Expand Down
11 changes: 11 additions & 0 deletions plag_submissions_utils/common/sents.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ class SentsHolder(object):
"""
def __init__(self, text, opts, segment = False):
super(SentsHolder, self).__init__()
self._opts = opts
if isinstance(text, (list, )):
#It is possible in essays of version 2.
#Original text is already segmented by writer!
Expand Down Expand Up @@ -49,6 +50,16 @@ def get_avg_words_cnt(self):
def get_sent_info(self, sent_num):
    # SentInfo for the sentence at index sent_num (no bounds checking —
    # an out-of-range index raises IndexError).
    return self._sent_infos[sent_num]

def add_sent(self, sent, tokenize = True):
    """Append a sentence to this holder.

    When ``tokenize`` is True the sentence is also tokenized (honouring the
    ``normalize`` / ``skip_stop_words`` options captured in ``self._opts``)
    and a matching SentInfo is recorded, keeping ``_sents``,
    ``_sent_tokens`` and ``_sent_infos`` aligned.

    NOTE(review): with tokenize=False the parallel token/info lists fall
    out of step with ``_sents`` — confirm callers expect that.
    """
    self._sents.append(sent)
    if tokenize:
        tokens = text_proc.tok_sent(sent,
                                    normalize = self._opts.normalize,
                                    skip_stop_words = self._opts.skip_stop_words)
        self._sent_tokens.append(tokens)
        self._sent_infos.append(SentInfo(len(tokens)))


def get_sents(self):
    # The raw (untokenized) sentences, in insertion order.
    return self._sents

Expand Down
8 changes: 6 additions & 2 deletions plag_submissions_utils/common/source_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,9 +119,13 @@ def _try_sequence_matcher(self, sent):
def is_sent_in_doc(self, sent):
    # A sentence counts as "in" the document when an offset match for it
    # can be located; get_sent_offs returns None on no match.
    return self.get_sent_offs(sent) is not None

def get_sent_offs(self, sent):
def get_sent_offs(self, sent,
preproc_sent = True):

text = sent
if preproc_sent:
text = text_proc.preprocess_text(text.strip())

text = text_proc.preprocess_text(sent.strip())
if not text:
raise RuntimeError("no text left after text preprocessing")
logging.debug("stripped text: %s", text)
Expand Down
1 change: 1 addition & 0 deletions plag_submissions_utils/common/src_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def get_src(self, res_id):
return None

def to_csv(self, out):
    """Write this mapping to ``out`` as CSV: a header row, then one record
    per source (Python 2 dict view — iteration order is unspecified).

    NOTE(review): no trailing newline is written after the last record;
    confirm from_csv tolerates that.
    """
    out.write("susp_id,filename,textapp_id,md5,ext_id\n")
    out.write("\n".join(s.to_csv_record() for s in self._srcs.viewvalues()))

def from_csv(self, file_path):
Expand Down
6 changes: 5 additions & 1 deletion plag_submissions_utils/common/submissions.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
from .extract_utils import extract_submission
from .version import determine_version_by_id

def run_over_submissions(subm_dir, arc_proc, limit_by_version = None):
def run_over_submissions(subm_dir, arc_proc, limit_by_version = None,
include_ids_set = None):
entries = os.listdir(subm_dir)
for entry in entries:
temp_dir = None
Expand All @@ -23,6 +24,9 @@ def run_over_submissions(subm_dir, arc_proc, limit_by_version = None):
if limit_by_version != determine_version_by_id(susp_id):
continue

if include_ids_set and int(susp_id) not in include_ids_set:
continue

arc_path = glob.glob(arc_dir + "/*")
if not arc_path:
logging.warning("empty submission dir %s", arc_dir)
Expand Down
19 changes: 8 additions & 11 deletions plag_submissions_utils/common/translated_chunks.py
Original file line number Diff line number Diff line change
def _create_translation_type(translation_str, orig_str):
    """Map a spreadsheet cell value to a TranslatorType constant.

    A bare "-" is ambiguous: it means ORIGINAL when the original-text cell
    is empty, and MANUAL otherwise.
    """
    tls = translation_str.strip().lower()
    if tls == "yandex":
        return TranslatorType.YANDEX
    if tls == "google":
        return TranslatorType.GOOGLE
    if tls == "original" or (tls == '-' and not orig_str):
        return TranslatorType.ORIGINAL
    if tls == "manual" or tls == '-':
        return TranslatorType.MANUAL
    # Bug fix: the fallthrough returned chunks.ModType.UNK — an enum from a
    # different family. Return the translator-type UNK (used elsewhere in
    # this module, e.g. get_translator_type).
    return TranslatorType.UNK


class TranslatedChunk(chunks.Chunk):
Expand All @@ -66,12 +65,7 @@ def get_translator_type(self):
return TranslatorType.UNK

def get_translator_type_str(self):
    """Return the string form of this chunk's translator type(s).

    Delegates to translation_types_to_str, which (unlike the superseded
    hand-rolled mapping this replaces) covers every TranslatorType value,
    not just google/yandex. The flattened diff had left the dead old body
    above the new delegation; only the delegation is the live code.
    """
    return translation_types_to_str(self._translator_types)

def get_all_translator_types(self):
    # Full list of translator types recorded for this chunk (internal
    # list, not a copy).
    return self._translator_types
Expand Down Expand Up @@ -101,3 +95,6 @@ def get_translated_sents(self):

def get_translated_tokens(self):
    # Token lists of the translated sentences; delegates to the
    # translated-sentences holder.
    return self._translated_sents.get_all_tokens()

def get_translated_text(self):
    # Whole translated text as assembled by the sentence holder.
    return self._translated_sents.get_text()
93 changes: 85 additions & 8 deletions plag_submissions_utils/gen_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,10 @@ def __init__(self, opts):
self._mapping = src_mapping.SrcMap()
self._mapping.from_csv(opts.mapping)
else:
ids = _load_ids(opts.ids_file)
self._mapping = create_mapping(opts.subm_dir,
use_filename_as_id = opts.use_filename_as_id)
use_filename_as_id = opts.use_filename_as_id,
ids = ids)

self._src_map = None
self._init_src_map()
Expand Down Expand Up @@ -88,6 +90,35 @@ def on_susp_end(self, susp_doc):
def get_name(self):
    # Stable identifier of this pipeline stage (presumably used for
    # logging/reporting — confirm against Generator).
    return 'SrcRetrievalMetaGenerator'

class SimilarDocumentsMetaGenerator(SrcRetrievalMetaGenerator):
    """Emit one CSV that, for each suspicious document, ranks the source
    documents by how many of their sentences were reused."""

    def __init__(self, opts):
        super(SimilarDocumentsMetaGenerator, self).__init__(opts)

        self._out_path = fs.join(self._opts.src_retr_out_dir,
                                 "retrieval_data.csv")
        # Truncate any previous run's output and write the header exactly once.
        with open(self._out_path, 'w') as outf:
            outf.write("srcID,srcTitle,dstID,dstTitle,rank,reused_sent_cnt\n")

    def on_susp_end(self, susp_doc):
        susp_id = susp_doc.get_susp_id()

        # Most-reused sources first; negating the count (instead of
        # reverse=True) keeps the stable sort's tie order unchanged.
        ranked = sorted(self._src_map.iteritems(), key=lambda kv: -kv[1])

        with open(self._out_path, 'a') as outf:
            for rank, (src_id, sent_cnt) in enumerate(ranked, 1):
                if sent_cnt < self._opts.min_sent_cnt:
                    continue
                outf.write("%s,,%s,,%d,%d\n" % (susp_id, src_id,
                                                rank, sent_cnt))

        # Reset the per-document counters for the next suspicious doc.
        self._init_src_map()

    def get_name(self):
        return 'SimilarDocumentsMetaGenerator'

class TextAlignmentMetaGenerator(object):
def __init__(self, opts):
Expand Down Expand Up @@ -204,7 +235,7 @@ def __init__(self, opts, out_pipes):
self._out_pipes = out_pipes

def process_chunk(self, susp_doc, chunk, sources):
if chunk.get_mod_type() == ModType.ORIG:
if chunk.get_mod_type() == ModType.ORIG or not chunk.get_orig_doc_filename():
return
source = sources[chunk.get_orig_doc_filename()]
for sent in chunk.get_orig_sents():
Expand Down Expand Up @@ -251,9 +282,11 @@ def process_archive(self, archive_path, susp_id):


def process_submissions(self):
    """Walk all submissions under opts.subm_dir, optionally restricted to
    the ids listed in opts.ids_file, feeding every extracted archive to
    process_extracted_archive.

    Fix: the flattened diff left the superseded positional-argument line
    (`self._opts.limit_by_version)`) next to its replacement; only the
    keyword-extended call is live.
    """
    ids = _load_ids(self._opts.ids_file)
    run_over_submissions(self._opts.subm_dir,
                         self.process_extracted_archive,
                         self._opts.limit_by_version,
                         include_ids_set = ids)


class SuspDocGenerator(object):
Expand Down Expand Up @@ -306,17 +339,24 @@ def write_sources_to_files(mapping, susp_id, sources, out_dir,


def create_mapping(subm_dir, limit_by_version = None,
                   use_filename_as_id = False, ids = None):
    """Build a SrcMap covering every submission under subm_dir.

    limit_by_version -- process only submissions of that essay version;
    use_filename_as_id -- derive source ids from file names;
    ids -- optional set of suspicious-doc ids to restrict the walk to.

    Fix: the flattened diff duplicated the old signature and old
    run_over_submissions call next to their replacements; only the
    ids-aware versions are live.
    """
    mapping = src_mapping.SrcMap()

    def arc_proc(susp_id, sources_dir, _):
        # Collect sources of one submission into the shared mapping.
        src_mapping.add_src_from_dir(susp_id, sources_dir, mapping,
                                     use_filename_as_id)

    run_over_submissions(subm_dir, arc_proc, limit_by_version,
                         include_ids_set = ids)
    return mapping

#cli support
def _load_ids(ids_file):
if ids_file:
with open(ids_file, 'r') as f:
return frozenset([int(l) for l in f])
return None


def create_text_align_meta(opts):
pipes = [TextAlignmentMetaGenerator(opts)]
Expand All @@ -328,6 +368,12 @@ def create_src_retr_meta(opts):
gener = Generator(opts, pipes)
gener.process_submissions()


def create_doc_sim_meta(opts):
    # CLI handler for the 'doc_sim' subcommand: run every submission
    # through the SimilarDocumentsMetaGenerator pipeline.
    pipes = [SimilarDocumentsMetaGenerator(opts)]
    gener = Generator(opts, pipes)
    gener.process_submissions()

def create_pan_meta(opts):
pipes = [SrcRetrievalMetaGenerator(opts),
TextAlignmentMetaGenerator(opts)]
Expand All @@ -341,12 +387,14 @@ def dumb_dump(opts):
gener.process_submissions()

def gen_map(opts):
    """CLI handler for 'gen_map': build the source mapping (optionally
    restricted to ids from opts.ids_file) and dump it to opts.mapping_file
    as CSV.

    Fix: the flattened diff left the superseded create_mapping call line
    (without `ids`) next to its replacement; only the ids-aware call is live.
    """
    ids = _load_ids(opts.ids_file)
    mapping = create_mapping(opts.subm_dir, opts.limit_by_version,
                             opts.use_filename_as_id, ids)
    with open(opts.mapping_file, 'w') as f:
        mapping.to_csv(f)

def create_sources(opts):
ids = _load_ids(opts.ids_file)
mapping = src_mapping.SrcMap()
mapping.from_csv(opts.mapping)

Expand All @@ -355,7 +403,8 @@ def arc_proc(susp_id, sources_dir, _):
write_sources_to_files(mapping, susp_id, sources, opts.out_dir,
opts.ext_id_as_filename)

run_over_submissions(opts.subm_dir, arc_proc, opts.limit_by_version)
run_over_submissions(opts.subm_dir, arc_proc, opts.limit_by_version,
include_ids_set = ids)

def create_susp_docs(opts):
def arc_proc(susp_id, _, meta_file_path):
Expand All @@ -368,7 +417,9 @@ def arc_proc(susp_id, _, meta_file_path):
if not fs.exists(opts.out_dir):
os.makedirs(opts.out_dir)

run_over_submissions(opts.subm_dir, arc_proc, opts.limit_by_version)
ids = _load_ids(opts.ids_file)
run_over_submissions(opts.subm_dir, arc_proc, opts.limit_by_version,
include_ids_set = ids)


def main():
Expand Down Expand Up @@ -401,6 +452,9 @@ def main():
gen_map_parser.add_argument("--mapping_file", "-o", default = "src_mapping.csv",
help = "mapping file path")
gen_map_parser.add_argument("--use_filename_as_id", "-u", action='store_true')
gen_map_parser.add_argument("--ids_file", "-I", default='',
help = "use only those ids, otherwise process everything")


gen_map_parser.set_defaults(func = gen_map)

Expand All @@ -413,13 +467,17 @@ def main():
help = "mapping file path")
create_src_parser.add_argument("--out_dir", "-o", default="essay_src")
create_src_parser.add_argument("--ext_id_as_filename", "-e", action="store_true")
create_src_parser.add_argument("--ids_file", "-I", default='',
help = "use only those ids, otherwise process everything")
create_src_parser.set_defaults(func = create_sources)

create_susp_parser = subparsers.add_parser('create_susp',
help='help of create_susp')
create_susp_parser.add_argument("--subm_dir", "-i", required = True,
help = "directory with submissions")
create_susp_parser.add_argument("--out_dir", "-o", default="essay_susp")
create_susp_parser.add_argument("--ids_file", "-I", default='',
help = "use only those ids, otherwise process everything")
create_susp_parser.set_defaults(func = create_susp_docs)

text_align_parser = subparsers.add_parser('text_align',
Expand All @@ -431,6 +489,8 @@ def main():
text_align_parser.add_argument("--mapping", "-m", default = "src_mapping.csv",
help = "mapping file path")
text_align_parser.add_argument("--use_filename_as_id", "-u", action='store_true')
text_align_parser.add_argument("--ids_file", "-I", default='',
help = "use only those ids, otherwise process everything")
text_align_parser.set_defaults(func = create_text_align_meta)

src_retr_parser = subparsers.add_parser('src_retr',
Expand All @@ -442,8 +502,23 @@ def main():
help = "mapping file path")
src_retr_parser.add_argument("--min_sent_cnt", "-s", default=4, type=int)
src_retr_parser.add_argument("--use_filename_as_id", "-u", action='store_true')
src_retr_parser.add_argument("--ids_file", "-I", default='',
help = "use only those ids, otherwise process everything")
src_retr_parser.set_defaults(func = create_src_retr_meta)

dos_sim_parser = subparsers.add_parser('doc_sim',
help='help of src_retr')
dos_sim_parser.add_argument("--subm_dir", "-i", required = True,
help = "directory with submissions")
dos_sim_parser.add_argument("--src_retr_out_dir", "-o", default="doc_sim")
dos_sim_parser.add_argument("--mapping", "-m", default = "src_mapping.csv",
help = "mapping file path")
dos_sim_parser.add_argument("--min_sent_cnt", "-s", default=2, type=int)
dos_sim_parser.add_argument("--use_filename_as_id", "-u", action='store_true')
dos_sim_parser.add_argument("--ids_file", "-I", default='',
help = "use only those ids, otherwise process everything")
dos_sim_parser.set_defaults(func = create_doc_sim_meta)

pan_parser = subparsers.add_parser('pan',
help='help of pan')
pan_parser.add_argument("--subm_dir", "-i", required = True,
Expand All @@ -455,6 +530,8 @@ def main():
pan_parser.add_argument("--use_filename_as_id", "-u", action='store_true')
pan_parser.add_argument("--text_align_out_dir", "-t",
default="01-manual-plagiarism")
pan_parser.add_argument("--ids_file", "-I", default='',
help = "use only those ids, otherwise process everything")
pan_parser.set_defaults(func = create_pan_meta)
args = parser.parse_args()

Expand Down
Loading

0 comments on commit 5af463b

Please sign in to comment.