-
Notifications
You must be signed in to change notification settings - Fork 139
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
b2e4f6d
commit 3d339bf
Showing
2 changed files
with
294 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,160 @@ | ||
import jieba | ||
import jieba.posseg as jp | ||
import pdb | ||
import json | ||
import os | ||
import re | ||
from multiprocessing import Process, cpu_count | ||
# https://blog.csdn.net/xyisv/article/details/104482818 | ||
import hashlib | ||
import time | ||
# Token filters applied with ``.match`` (anchored at the token start) in
# process_data.
# 18-64 lowercase hex characters; presumably hashed image file names -- TODO confirm.
image_name = re.compile(r'[0-9a-f]{18,64}')
# Digit, dot, digit (e.g. "1.2"); presumably two-level section numbers.
chapter2 = re.compile(r'[0-9]{1}\.[0-9]{1}')
# Digit, dot, digit, dot, digit (e.g. "1.2.3"); presumably three-level section numbers.
chapter3 = re.compile(r'[0-9]{1}\.[0-9]{1}\.[0-9]{1}')
|
||
def load_stopwords(filepath: str = 'cn_en_stopwords.txt'):
    """Load the stop-word list, one word per line.

    Args:
        filepath: Path of the stop-word file. Defaults to
            ``cn_en_stopwords.txt`` in the current working directory.

    Returns:
        A list with every non-blank line of the file, stripped of
        surrounding whitespace, in file order.
    """
    # Decode explicitly as UTF-8 (as load_newwords does) instead of relying
    # on the platform's locale encoding.
    with open(filepath, encoding='utf8') as f:
        stripped = (line.strip() for line in f)
        return [word for word in stripped if word]
|
||
def load_documents(
        n: int = 1,
        basedir: str = '/home/data/khj/workspace/huixiangdou/repodir.lda'):
    """Collect the non-image files under ``basedir`` and split them into chunks.

    Args:
        n: Number of chunks to produce. Values below 1 are treated as 1.
        basedir: Root directory that is walked recursively.

    Returns:
        A list of ``n`` lists of ``(filename, filepath)`` tuples. Chunk sizes
        differ by at most one; the leading chunks receive the extra items.
        Chunks may be empty when there are fewer files than ``n``.
    """
    # n == 0 (e.g. ``cpu_count() - 1`` on a single-core host) used to raise
    # ZeroDivisionError below.
    n = max(1, n)

    image_suffixes = ('.jpg', '.png', '.jpeg')
    docs = []
    for root, _, filenames in os.walk(basedir):
        for filename in filenames:
            # Image files are not collected. The original dropped into pdb
            # here, which stalls an unattended batch run.
            if filename.endswith(image_suffixes):
                continue
            docs.append((filename, os.path.join(root, filename)))

    step, remainder = divmod(len(docs), n)

    result = []
    start = 0
    for i in range(n):
        # The first ``remainder`` chunks take one extra document each.
        end = start + step + (1 if i < remainder else 0)
        result.append(docs[start:end])
        start = end

    return result
|
||
def load_newwords(basename: str = './newwords'):
    """Load and concatenate the JSON word lists stored in a directory.

    Args:
        basename: Directory whose entries are each parsed as a UTF-8 JSON
            document. Defaults to ``./newwords``.

    Returns:
        A single list made by concatenating the parsed content of every file,
        in sorted file-name order.
    """
    words = []
    # os.listdir order is arbitrary; sorting makes the result reproducible.
    for filename in sorted(os.listdir(basename)):
        filepath = os.path.join(basename, filename)
        with open(filepath, encoding='utf8') as f:
            words += json.load(f)
        print('load {}'.format(filepath))
    return words
|
||
def content_hash(input_str: str):
    """Return a short fingerprint of ``input_str``.

    The string is encoded with ``str.encode()`` defaults and hashed with
    SHA-256; the first 6 characters of the hexadecimal digest are returned.
    """
    digest = hashlib.sha256(input_str.encode()).hexdigest()
    return digest[:6]
|
||
def process_data(documents: list, pid: int):
    """Tokenize a chunk of documents and write the filtered text to disk.

    Each document is read, truncated to its leading 80%, and segmented with
    ``jieba.posseg``. Tokens are dropped when they are empty, in the stop-word
    list, equal to ``'images'``, or matched by one of the module-level
    patterns (``image_name``, ``chapter2``, ``chapter3``). The remaining
    tokens are joined with single spaces. Results shorter than 300 characters
    are discarded. Others are written to ``preprocess/<pid>/<hash>.md``, and a
    ``<hash>\\t <source path>`` line is appended to ``name_map.txt``.

    Args:
        documents: Iterable of ``(filename, filepath)`` pairs to process.
        pid: Worker index; used for the output sub-directory and log lines.
    """
    t0 = time.time()
    # Register the extra vocabulary with jieba, tagged as nouns.
    for w in load_newwords():
        jieba.add_word(w, tag='n')

    # A set keeps the per-token membership test O(1); the list form made the
    # inner loop scan the whole stop-word list for every token.
    stop_words = set(load_stopwords())
    print('{} start..'.format(pid))
    bad_patterns = [image_name, chapter2, chapter3]

    dirname = os.path.join('preprocess', str(pid))

    for _, filepath in documents:
        with open(filepath, encoding='utf8') as f:
            d = f.read()
        # Keep only the leading 80% of the text.
        d = d[0:int(len(d) * 0.8)]

        filtered = []
        for pair in jp.cut(d):
            c = pair.word.strip()
            # Whitespace-only tokens strip to ''; keeping them only pads the
            # output with runs of spaces and inflates the length check below.
            if not c or c in stop_words or c == 'images':
                continue
            if any(pattern.match(c) for pattern in bad_patterns):
                continue
            filtered.append(c)

        new_content = ' '.join(filtered)
        # Also covers the "no tokens left" case (empty string).
        if len(new_content) < 300:
            continue

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(dirname, exist_ok=True)

        hashname = content_hash(new_content)
        outfilepath = os.path.join(dirname, hashname + '.md')

        # One write per record, since several workers append to this file.
        with open('name_map.txt', 'a', encoding='utf8') as f:
            f.write('{}\t {}\n'.format(hashname, filepath))

        with open(outfilepath, 'w', encoding='utf8') as f:
            f.write(new_content)
    print('{} finish, timecost {}'.format(pid, time.time() - t0))
|
||
def _get_num_processes(): | ||
num_processes = cpu_count() - 1 # Good habit to leave 1 core. | ||
return num_processes | ||
|
||
def main():
    """Split the corpus into chunks and run ``process_data`` on each one.

    Every chunk is handed to its own ``Process``; the function returns after
    all started processes have been joined. Setting the local debug flag to
    ``True`` runs the chunks sequentially in the current process instead.
    """
    run_inline = False

    chunks = load_documents(n=_get_num_processes())
    workers = []
    for idx, chunk in enumerate(chunks):
        print(f'Distributing to process[{idx}]...')

        if run_inline:
            process_data(chunk, idx)
            continue

        worker = Process(target=process_data, args=(chunk, idx))
        worker.start()
        print(f'Distributed to process[{idx}].')
        workers.append(worker)

    for worker in workers:
        worker.join()


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,134 @@ | ||
# Author: Olivier Grisel <[email protected]> | ||
# Lars Buitinck | ||
# Chyi-Kwei Yau <[email protected]> | ||
# License: BSD 3 clause | ||
|
||
from time import time | ||
import shutil | ||
import matplotlib.pyplot as plt | ||
import pdb | ||
import os | ||
import numpy as np | ||
|
||
from sklearn.datasets import fetch_20newsgroups | ||
from sklearn.decomposition import LatentDirichletAllocation | ||
from sklearn.feature_extraction.text import CountVectorizer | ||
import jieba | ||
import jieba.posseg as jp | ||
import json | ||
import re | ||
from multiprocessing import Process, cpu_count | ||
# https://blog.csdn.net/xyisv/article/details/104482818 | ||
import pickle as pkl | ||
|
||
# Vocabulary cap passed to CountVectorizer(max_features=...).
n_features = 2048
# Number of topics passed to LatentDirichletAllocation(n_components=...).
n_components = 100
# How many of the highest-weighted words are kept per topic.
n_top_words = 100
# NOTE(review): not referenced anywhere in the visible code.
batch_size = 128
|
||
def files(basedir: str = '/home/data/khj/workspace/huixiangdou/lda/preprocess'):
    """List the non-image files found recursively under ``basedir``.

    Args:
        basedir: Root directory to walk.

    Returns:
        A list of ``(filename, filepath)`` tuples sorted by ``filepath``.
    """
    image_suffixes = ('.jpg', '.png', '.jpeg')
    docs = []
    # ``filenames`` rather than ``files``: the original loop variable shadowed
    # this function's own name.
    for root, _, filenames in os.walk(basedir):
        for filename in filenames:
            # Image files are not listed. The original dropped into pdb here.
            if filename.endswith(image_suffixes):
                continue
            docs.append((filename, os.path.join(root, filename)))
    # A stable order matters: build_topic calls this function more than once
    # and matches results by position.
    docs.sort(key=lambda entry: entry[1])
    return docs
|
||
def filecontents(dirname: str):
    """Yield the text of every non-empty file listed by ``files()``.

    Args:
        dirname: Currently ignored. The directory scanned is whatever
            ``files()`` walks by default. Kept so existing callers still work.

    Yields:
        The full content of each file, skipping files that read as empty.
    """
    for _, filepath in files():
        # Decode explicitly as UTF-8 rather than with the locale default.
        with open(filepath, encoding='utf8') as f:
            content = f.read()
        if content:
            yield content
|
||
def load_namemap(filepath: str = 'name_map.txt'):
    """Read the hash-to-source-path table.

    Each useful line has the form ``<hash>\\t <path>``. Lines without a tab
    (blank or malformed) are skipped. When a hash occurs more than once, the
    last occurrence wins.

    Args:
        filepath: Path of the mapping file. Defaults to ``name_map.txt`` in
            the current working directory.

    Returns:
        A dict mapping the stripped hash to the stripped path.
    """
    namemap = {}
    with open(filepath, encoding='utf8') as f:
        for line in f:
            # Split on the first tab only, so a path that itself contains a
            # tab is kept whole.
            hashname, sep, source = line.partition('\t')
            if not sep:
                # No tab: ``parts[1]`` used to raise IndexError here.
                continue
            namemap[hashname.strip()] = source.strip()
    return namemap
|
||
# reference step https://blog.csdn.net/xyisv/article/details/104482818 | ||
def plot_top_words(model, feature_names, n_top_words, title):
    """Draw one horizontal bar chart per topic and save the figure.

    The subplot grid is five columns wide with as many rows as needed, so
    models with more than ten topics no longer index past a fixed 2x5 grid.
    The figure is written to ``topic_centers.jpg`` in the working directory.

    Args:
        model: Fitted model exposing ``components_`` (one row per topic).
        feature_names: Array of vocabulary terms, indexable by an index
            array (e.g. the result of ``get_feature_names_out()``).
        n_top_words: Number of highest-weighted terms shown per topic.
        title: Figure title.
    """
    n_cols = 5
    n_topics = len(model.components_)
    n_rows = max(1, -(-n_topics // n_cols))  # ceiling division
    # 7.5 inches per row reproduces the original 30x15 figure for ten topics.
    # squeeze=False keeps ``axes`` two-dimensional even for a single row.
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(30, 7.5 * n_rows), sharex=True, squeeze=False
    )
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        top_features_ind = topic.argsort()[-n_top_words:]
        top_features = feature_names[top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 30})
        ax.tick_params(axis="both", which="major", labelsize=20)
        for i in "top right left".split():
            ax.spines[i].set_visible(False)

    # Hide grid cells that have no topic to show.
    for ax in axes[n_topics:]:
        ax.set_visible(False)

    fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.savefig('topic_centers.jpg')
|
||
def build_topic(dirname: str = 'preprocess'):
    """Fit an LDA topic model on the preprocessed corpus and group the sources.

    Steps:
      1. Vectorize the preprocessed files with ``CountVectorizer``.
      2. Fit ``LatentDirichletAllocation`` and obtain per-document topic
         scores.
      3. Pickle both fitted objects to ``lda_models.pkl``.
      4. Write the top ``n_top_words`` terms of every topic to
         ``cluster/desc.json``.
      5. Copy each document's original source file (looked up through
         ``name_map.txt``) into ``cluster/<topic_id>/`` for every topic whose
         score exceeds 0.1.

    Args:
        dirname: Currently ignored. The corpus location is whatever
            ``files()`` walks by default. Kept for interface compatibility.
    """
    namemap = load_namemap()

    tf_vectorizer = CountVectorizer(
        max_df=0.95, min_df=2, max_features=n_features, stop_words="english"
    )

    # Files that actually contributed a row to the term matrix, in row order.
    # Recording them while streaming keeps row i of ``doc_types`` aligned with
    # ``used_files[i]`` even when empty files are skipped.
    used_files = []

    def _contents():
        for entry in files():
            with open(entry[1], encoding='utf8') as f:
                content = f.read()
            if content:
                used_files.append(entry)
                yield content

    t0 = time()
    tf = tf_vectorizer.fit_transform(_contents())
    print("BoW in %0.3fs." % (time() - t0))

    lda = LatentDirichletAllocation(
        n_components=n_components,
        max_iter=5,
        learning_method="online",
        learning_offset=50.0,
        random_state=0,
    )
    t0 = time()
    doc_types = lda.fit_transform(tf)
    print("lda train in %0.3fs." % (time() - t0))
    feature_names = tf_vectorizer.get_feature_names_out()

    models = {'CountVectorizer': tf_vectorizer, 'LatentDirichletAllocation': lda}
    with open('lda_models.pkl', 'wb') as model_file:
        pkl.dump(models, model_file)

    top_features_list = [
        feature_names[topic.argsort()[-n_top_words:]].tolist()
        for topic in lda.components_
    ]

    # The output directory must exist before desc.json is written; it used to
    # be created only later, inside the copy loop.
    os.makedirs('cluster', exist_ok=True)
    with open(os.path.join('cluster', 'desc.json'), 'w', encoding='utf8') as f:
        f.write(json.dumps(top_features_list, ensure_ascii=False))

    for (basename, _), doc_score in zip(used_files, doc_types):
        hashname = basename.split('.')[0]
        source_filepath = namemap.get(hashname)
        if source_filepath is None:
            # An incomplete name map should not abort the run after training.
            print('skip {}: not found in name_map.txt'.format(basename))
            continue
        for topic_id in np.where(doc_score > 0.1)[0]:
            target_dir = os.path.join('cluster', str(topic_id))
            os.makedirs(target_dir, exist_ok=True)
            shutil.copy(source_filepath, target_dir)


if __name__ == '__main__':
    build_topic()