Skip to content

Commit

Permalink
add: 新增多线程批处理方式
Browse files Browse the repository at this point in the history
  • Loading branch information
carolcoral committed Sep 14, 2023
1 parent a3cd4c9 commit 61506ff
Show file tree
Hide file tree
Showing 4 changed files with 182 additions and 67 deletions.
26 changes: 26 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,32 @@ python3 main.py --dir_path "example/movies","example/tvs" --output data/metadata
nohup python3 main.py > nohup.log 2>&1 & echo $! > run.pid
```

### 多线程刮削
> 前置要求:需要先执行main.py脚本的"collect"模式收集nfo元数据文件
#### 1. 直接修改脚本文件方式
1. 修改 `multi_thread.py` 文件中 `if __name__ == '__main__':` 代码块中 `__dir_path`、`__output`、`__tmdb_token`、`__mode` 参数值
2. 执行脚本
```shell
python3 multi_thread.py
```

#### 2. 命令行执行
> 注意参数 `--dir_path` 的值如果需要配置多个,请使用英文半角逗号拼接,不要有空格
```shell
python3 multi_thread.py --dir_path "example/movies","example/tvs" --output data/metadata/person --tmdb_token tmdb_token
```

#### 3. 后台执行
> 可以结合前两种执行方式使用
```shell
nohup python3 multi_thread.py > nohup.log 2>&1 & echo $! > run.pid
```



### 补充
1. 运行提示 `No module named 'requests'`,但实际 Python 环境中已经安装了该模块:
* 查看当前执行的python版本:```python --version```
Expand Down
57 changes: 35 additions & 22 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from utils.collect_metadata import __collect_nfo
from utils.redo import __redo, __check
from utils.scrape import __execute
from utils.scrape import Scrape
from utils.LoggerUtil import Logger


Expand Down Expand Up @@ -82,6 +82,12 @@ def __get_sys_args(log):
"collect(元数据文件转移)/scrape(元数据刮削)/redo(重新刮削异常元数据)"))
raise SystemExit(1)
arg_json["__mode"] = mode_value
if "--language" not in arg_key.keys():
log.logger.warn("未输入脚本执行语言,默认使用中文简体语言格式:{0}".format("--language"))
arg_json["__language"] = "zh-CN"
else:
mode_value = sys.argv[arg_key["--language"] + 1]
arg_json["__language"] = mode_value
return arg_json


Expand All @@ -92,18 +98,40 @@ def __create_default_dirs():
os.makedirs("./redo")


def __master_execute(log, dir_path, output, tmdb_token, mode, language="zh-CN"):
    """Run the selected mode once for every directory in ``dir_path``.

    Args:
        log: Logger wrapper; passed through to the helpers and to ``Scrape``.
        dir_path: Iterable of directories to process.
        output: Directory that receives the actor ("person") metadata.
        tmdb_token: TMDB API token forwarded to the scraping helpers.
        mode: One of ``"collect"``, ``"scrape"``, ``"redo"`` or ``"check"``;
            any other value makes the loop a no-op.
        language: Metadata language requested from TMDB (default ``"zh-CN"``,
            Simplified Chinese).
    """
    # Check the Python version before doing any work.
    __check_version(log=log)
    # Make sure the default working directories exist.
    __create_default_dirs()
    if "scrape" == mode:
        # Remove the stored error records exactly once, BEFORE any worker is
        # started.  ``Scrape.start()`` returns immediately, so deleting the
        # file inside the loop could drop records written by a worker that
        # was started for an earlier directory.
        error_file_path = "./error_tmdb_ids.txt"
        try:
            os.remove(error_file_path)
        except FileNotFoundError:
            pass
    for real_dir_path in dir_path:
        if "collect" == mode:
            __collect_nfo(log, real_dir_path, output)
        if "scrape" == mode:
            scrape = Scrape(log=log, dir_path=real_dir_path, output=output, tmdb_token=tmdb_token, language=language)
            # Runs in its own thread; this function does not wait for it.
            scrape.start()
        # NOTE(review): "redo" and "check" do not take the directory as an
        # argument, yet they are executed once per entry of ``dir_path``.
        # Kept as-is to preserve behaviour; confirm whether once is enough.
        if "redo" == mode:
            __redo(log=log, output=output, tmdb_token=tmdb_token, language=language)
        if "check" == mode:
            __check(scan_path=output)


if __name__ == '__main__':
# 初始化日志
__log = __init_logger()
sys_args = __get_sys_args(log=__log)
# 扫描目录
# __dir_path = ["/volume2/video/animation", "/volume2/video/children", "/volume2/video/documentary", "/volume2/video/movies", "/volume2/video/tvs", "/volume2/video/variety"]
__dir_path = ["/Users/liuxuewen/workspace/self/gitea/tmdb-person/data/metadata/nfo"]
__dir_path = ["/data/tmdb-person/data/metadata/nfo"]
# 输出演员元数据目录
__output = "/Users/liuxuewen/workspace/self/gitea/tmdb-person/data/metadata/person"
__output = "/data/tmdb-person/data/metadata/person"
# TMDB API TOKEN
__tmdb_token = "eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiIxYTU4ODAxMGY5OTUwYWEyNThhYjFhYjJlMjI4NGVmYSIsInN1YiI6IjYxYmRmOGNjMzgzZGYyMDA0MjIzNDhjOSIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.RPG8F8AELlK7MgrXDR2U0YRv61VteZZ9ponilnkQqkE"
__mode = "scrape"
__language = "zh-CN"
if len(sys_args.keys()) > 0:
# 扫描目录
__dir_path = sys_args["__dir_path"]
Expand All @@ -112,21 +140,6 @@ def __create_default_dirs():
# TMDB API TOKEN
__tmdb_token = sys_args["__tmdb_token"]
__mode = sys_args["__mode"]
# 检查python版本
__check_version(log=__log)
# 开始执行主程序
__create_default_dirs()
# 默认 language="zh-CN" (简体中文),可以通过修改 "language" 的值变更获取元数据的语言类别
for __real_dir_path in __dir_path:
if "collect" == __mode:
__collect_nfo(__log, __real_dir_path, __output)
if "scrape" == __mode:
# 删除异常信息存储文件
error_file_path = "./error_tmdb_ids.txt"
if os.path.exists(error_file_path):
os.remove(error_file_path)
__execute(log=__log, dir_path=__real_dir_path, output=__output, tmdb_token=__tmdb_token)
if "redo" == __mode:
__redo(log=__log, output=__output, tmdb_token=__tmdb_token)
if "check" == __mode:
__check(scan_path=__output)
__language = sys_args["__language"]
__master_execute(log=__log, dir_path=__dir_path, output=__output, tmdb_token=__tmdb_token, mode=__mode,
language=__language)
61 changes: 61 additions & 0 deletions multi_thread.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# 多线程模式执行脚本
import os
import shutil

from main import __init_logger, __get_sys_args, __check_version, __create_default_dirs
from utils.scrape import Scrape


def __cut_dirs(log, dir_path, output):
    """Copy collected metadata files into per-initial group folders.

    Every regular file found directly inside each directory of ``dir_path`` is
    copied to ``<parent of output>/nfo_list/<c>/`` where ``<c>`` is the
    lower-cased first character of the file name.  The source files are left
    in place.

    Args:
        log: Object exposing ``logger.info``.
        dir_path: Directory, or iterable of directories, holding the files.
        output: Actor metadata output directory; only its parent directory is
            used, as the location of ``nfo_list``.

    Returns:
        Path of the ``nfo_list`` directory containing the group folders.
    """
    log.logger.info("开始执行元数据文件分组:{0}".format(dir_path))
    # A bare string would otherwise be iterated character by character.
    if isinstance(dir_path, str):
        dir_path = [dir_path]
    # Split the nfo files into folders keyed by their lower-cased first letter.
    nfo_list = os.path.join(os.path.dirname(output), "nfo_list")
    os.makedirs(nfo_list, exist_ok=True)
    for nfo_file_dir in dir_path:
        for nfo_file in os.listdir(nfo_file_dir):
            source_path = os.path.join(nfo_file_dir, nfo_file)
            if not os.path.isfile(source_path):
                # shutil.copyfile cannot copy directories; this also covers
                # ``nfo_list`` itself when it lives inside a scanned directory.
                log.logger.info("跳过非文件条目:{0}".format(source_path))
                continue
            group_dir = os.path.join(nfo_list, nfo_file[0].lower())
            os.makedirs(group_dir, exist_ok=True)
            shutil.copyfile(source_path, os.path.join(group_dir, nfo_file))
    log.logger.info("结束执行元数据文件分组:{0}".format(nfo_list))
    return nfo_list


if __name__ == '__main__':
    # Initialise logging.
    __log = __init_logger()
    sys_args = __get_sys_args(log=__log)
    # Directories to scan.
    __dir_path = ["data/metadata/nfo"]
    # Output directory for actor metadata.
    __output = "data/metadata/person"
    # TMDB API token.  A real credential must not be committed to the
    # repository: supply it with --tmdb_token, through the TMDB_TOKEN
    # environment variable, or by editing this line locally.
    __tmdb_token = os.environ.get("TMDB_TOKEN", "")
    # NOTE: this script always scrapes; ``__mode`` is not consulted below.
    __mode = "scrape"
    __language = "zh-CN"
    if len(sys_args.keys()) > 0:
        # Directories to scan.
        __dir_path = sys_args["__dir_path"]
        # Output directory for actor metadata.
        __output = sys_args["__output"]
        # TMDB API token.
        __tmdb_token = sys_args["__tmdb_token"]
        __language = sys_args["__language"]
    if not __tmdb_token:
        # Fail fast instead of letting every TMDB request be rejected.
        __log.logger.error("未配置TMDB API TOKEN,请通过 --tmdb_token 参数或环境变量 TMDB_TOKEN 提供")
        raise SystemExit(1)
    # Check the Python version.
    __check_version(log=__log)
    # Start the main program.
    __create_default_dirs()
    __nfo_list = __cut_dirs(log=__log, dir_path=__dir_path, output=__output)
    # Remove the stored error records before any worker thread starts.
    error_file_path = "./error_tmdb_ids.txt"
    if os.path.exists(error_file_path):
        os.remove(error_file_path)
    for dir_name in os.listdir(__nfo_list):
        __group_dir = os.path.join(__nfo_list, dir_name)
        if not os.path.isdir(__group_dir):
            # Stray files in nfo_list (e.g. .DS_Store) are not group folders;
            # a worker would fail when it tries to list them.
            continue
        scrape = Scrape(log=__log, dir_path=__group_dir, output=__output,
                        tmdb_token=__tmdb_token, language=__language)
        # One worker thread per group folder.
        scrape.start()
105 changes: 60 additions & 45 deletions utils/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,53 +2,68 @@
# -*- coding: utf-8 -*-
import os
import shutil
import threading
import uuid

from utils.analyze import Analyze
from utils.tmdb import Tmdb

thread_lock = threading.Lock()

def __execute(log, dir_path, output, tmdb_token, language="zh-CN"):
    """Scrape actor metadata and posters for the .nfo files under ``dir_path``.

    Files whose name contains ``.nfo`` are picked up directly inside
    ``dir_path`` and one folder level below it.  For each actor entry that has
    a ``tmdbid``, the folder ``<output>/<initial>/<name>-tmdb-<id>`` is created
    if needed; ``Tmdb(...).create_actor_nfo()`` is called when it holds no
    ``person.nfo`` and ``Tmdb(...).get_actor_image()`` when it holds no
    ``folder.jpg``.  Each processed .nfo file is then moved to ``complete/``.
    """
    logger = log.logger
    logger.info("------------------- 开始获取演员元数据及海报 -------------------")
    logger.info("当前执行元数据刮削识别的根文件夹:{0}".format(dir_path))

    # Gather every candidate file first; processing starts afterwards.
    nfo_paths = []
    for entry in os.listdir(dir_path):
        entry_path = os.path.join(dir_path, entry)
        if os.path.isdir(entry_path):
            # Folder: take the .nfo files directly inside it.
            for child in os.listdir(entry_path):
                if ".nfo" in child:
                    nfo_paths.append(os.path.join(entry_path, child))
        elif os.path.isfile(entry_path) and ".nfo" in entry:
            nfo_paths.append(entry_path)

    for nfo_path in nfo_paths:
        logger.info("开始处理元数据刮削识别:{0}".format(nfo_path))
        nfo_data = Analyze(file_path=nfo_path).analyze()
        for actor in nfo_data["actors"]:
            logger.info("当前解析的演员信息: {0}".format(actor))
            if "tmdbid" not in actor.keys():
                continue
            tmdb_id = actor["tmdbid"]
            actor_name = actor["name"]
            actor_dir = os.path.join(output, actor_name[0].lower(), actor_name + "-tmdb-" + tmdb_id)
            if not os.path.exists(actor_dir):
                os.makedirs(actor_dir)
            # Skip the metadata scrape when person.nfo is already present.
            if "person.nfo" in os.listdir(actor_dir):
                logger.info("当前路径已存在person.nfo文件, 跳过刮削:{0}".format(actor_dir))
            else:
                Tmdb(log=log, tmdb_id=tmdb_id, actor_path=actor_dir, tmdb_token=tmdb_token,
                     language=language).create_actor_nfo()
            # Skip the poster scrape when folder.jpg is already present.
            if "folder.jpg" in os.listdir(actor_dir):
                logger.info("当前路径已存在folder.jpg文件, 跳过刮削:{0}".format(actor_dir))
            else:
                Tmdb(log=log, tmdb_id=tmdb_id, actor_path=actor_dir, tmdb_token=tmdb_token,
                     language=language).get_actor_image()
        # Move the processed .nfo file to the "complete" folder.
        shutil.move(nfo_path, "complete/")
    logger.info("------------------- 结束获取演员元数据及海报 -------------------")
class Scrape(threading.Thread):
    """Worker thread that scrapes actor metadata and posters for one folder.

    ``run`` picks up files whose name contains ``.nfo`` directly inside
    ``dir_path`` and one folder level below it, parses each with ``Analyze``
    and, for every actor entry that has a ``tmdbid``, works in the folder
    ``<output>/<initial>/<name>-tmdb-<id>``: ``Tmdb(...).create_actor_nfo()``
    is called when no ``person.nfo`` is present and
    ``Tmdb(...).get_actor_image()`` when no ``folder.jpg`` is present.  Each
    processed .nfo file is then moved to ``complete/``.
    """

    def __init__(self, log, dir_path, output, tmdb_token, language="zh-CN", thread_id=None):
        """Store the scraping parameters.

        Args:
            log: Object exposing ``logger.info``.
            dir_path: Folder to scan for .nfo files.
            output: Root folder that receives the actor metadata.
            tmdb_token: TMDB API token passed to ``Tmdb``.
            language: Metadata language passed to ``Tmdb``.
            thread_id: Identifier used in log lines; a fresh UUID is generated
                when omitted.
        """
        threading.Thread.__init__(self)
        # A ``uuid.uuid4()`` default in the signature is evaluated only once,
        # when the class is defined, which would give every thread the same
        # id.  Generate it per instance instead.
        self.thread_id = uuid.uuid4() if thread_id is None else thread_id
        self.log = log
        self.dir_path = dir_path
        self.output = output
        self.tmdb_token = tmdb_token
        self.language = language

    @staticmethod
    def _find_nfo_files(root):
        """Return paths of .nfo files directly in ``root`` or one folder below."""
        found = []
        for entry in os.listdir(root):
            entry_path = os.path.join(root, entry)
            if os.path.isdir(entry_path):
                for child in os.listdir(entry_path):
                    if ".nfo" in child:
                        found.append(os.path.join(entry_path, child))
            elif os.path.isfile(entry_path) and ".nfo" in entry:
                found.append(entry_path)
        return found

    def run(self):
        """Scrape every .nfo file found under ``self.dir_path``."""
        logger = self.log.logger
        logger.info("------------------- 开始获取演员元数据及海报:{0} -------------------".format(self.thread_id))
        logger.info("当前执行元数据刮削识别的根文件夹:{0}".format(self.dir_path))
        for file_path in self._find_nfo_files(self.dir_path):
            logger.info("开始处理元数据刮削识别:{0}".format(file_path))
            nfo_data = Analyze(file_path=file_path).analyze()
            for actor in nfo_data["actors"]:
                logger.info("当前解析的演员信息: {0}".format(actor))
                if "tmdbid" not in actor.keys():
                    continue
                tmdb_id = actor["tmdbid"]
                actor_name = actor["name"]
                actor_dir = os.path.join(self.output, actor_name[0].lower(), actor_name + "-tmdb-" + tmdb_id)
                # exist_ok=True tolerates another thread creating the same
                # folder concurrently, so no lock is needed -- and none can be
                # left held if makedirs raises.
                os.makedirs(actor_dir, exist_ok=True)
                # Skip the metadata scrape when person.nfo is already present.
                if "person.nfo" not in os.listdir(actor_dir):
                    Tmdb(log=self.log, tmdb_id=tmdb_id, actor_path=actor_dir, tmdb_token=self.tmdb_token,
                         language=self.language).create_actor_nfo()
                else:
                    logger.info("当前路径已存在person.nfo文件, 跳过刮削:{0}".format(actor_dir))
                # Skip the poster scrape when folder.jpg is already present.
                if "folder.jpg" not in os.listdir(actor_dir):
                    Tmdb(log=self.log, tmdb_id=tmdb_id, actor_path=actor_dir, tmdb_token=self.tmdb_token,
                         language=self.language).get_actor_image()
                else:
                    logger.info("当前路径已存在folder.jpg文件, 跳过刮削:{0}".format(actor_dir))
            # Move the processed .nfo file to the "complete" folder.
            shutil.move(file_path, "complete/")
        logger.info("------------------- 结束获取演员元数据及海报:{0} -------------------".format(self.thread_id))

0 comments on commit 61506ff

Please sign in to comment.