Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: integrate-seafile-ai-search #337

Open
wants to merge 4 commits into
base: 12.0
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion app/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
from seafevents.repo_metadata.index_master import RepoMetadataIndexMaster
from seafevents.repo_metadata.index_worker import RepoMetadataIndexWorker
from seafevents.seafevent_server.seafevent_server import SeafEventServer
from seafevents.app.config import ENABLE_METADATA_MANAGEMENT
from seafevents.app.config import ENABLE_METADATA_MANAGEMENT, ENABLE_SEAFILE_AI
from seafevents.semantic_search.index_task.filename_index_updater import RepoFilenameIndexUpdater


class App(object):
Expand Down Expand Up @@ -38,6 +39,8 @@ def __init__(self, config, ccnet_config, seafile_config,
if ENABLE_METADATA_MANAGEMENT:
self._index_master = RepoMetadataIndexMaster(config)
self._index_worker = RepoMetadataIndexWorker(config)
if ENABLE_SEAFILE_AI:
self._repo_filename_index_updater = RepoFilenameIndexUpdater(config)

def serve_forever(self):
if self._fg_tasks_enabled:
Expand All @@ -60,3 +63,5 @@ def serve_forever(self):
if ENABLE_METADATA_MANAGEMENT:
self._index_master.start()
self._index_worker.start()
if ENABLE_SEAFILE_AI:
self._repo_filename_index_updater.start()
1 change: 1 addition & 0 deletions app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
METADATA_SERVER_URL = getattr(seahub_settings, 'METADATA_SERVER_URL', '')
ENABLE_METADATA_MANAGEMENT = getattr(seahub_settings, 'ENABLE_METADATA_MANAGEMENT', False)
METADATA_FILE_TYPES = getattr(seahub_settings, 'METADATA_FILE_TYPES', {})
ENABLE_SEAFILE_AI = getattr(seahub_settings, 'ENABLE_SEAFILE_AI', False)
except ImportError:
logger.critical("Can not import seahub settings.")
raise RuntimeError("Can not import seahub settings.")
Expand Down
9 changes: 9 additions & 0 deletions mysql.sql
Original file line number Diff line number Diff line change
Expand Up @@ -213,3 +213,12 @@ CREATE TABLE IF NOT EXISTS `GroupIdLDAPUuidPair` (
UNIQUE KEY `group_id` (`group_id`),
UNIQUE KEY `group_uuid` (`group_uuid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

-- Table `index_repo`: one row per `repo_id` (enforced by the UNIQUE KEY below).
-- NOTE(review): presumably records libraries for which a search index has been
-- created -- confirm against the code that reads and writes this table.
CREATE TABLE IF NOT EXISTS `index_repo` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`repo_id` varchar(36) NOT NULL,
`created_at` datetime(6) NOT NULL,
-- `updated` is nullable, unlike `created_at`.
`updated` datetime(6),
PRIMARY KEY (`id`) USING BTREE,
UNIQUE KEY `repo_id`(`repo_id`)
) ENGINE = InnoDB DEFAULT CHARACTER SET = utf8;
2 changes: 1 addition & 1 deletion repo_data/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def create_engine_from_conf(config_file):
if seaf_conf.has_option('database', 'host'):
db_server = seaf_conf.get('database', 'host')
if seaf_conf.has_option('database', 'port'):
db_port =seaf_conf.getint('database', 'port')
db_port = seaf_conf.getint('database', 'port')
db_username = seaf_conf.get('database', 'user')
db_passwd = seaf_conf.get('database', 'password')
db_name = seaf_conf.get('database', 'db_name')
Expand Down
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,6 @@ pymysql
gevent==24.2.*
Flask==2.2.*
apscheduler
unstructured[docx,pptx]
transformers
JoinTyang marked this conversation as resolved.
Show resolved Hide resolved
ndjson
187 changes: 187 additions & 0 deletions seafevent_server/request_handler.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import jwt
import logging
import json

from flask import Flask, request, make_response
from seafevents.app.config import SEAHUB_SECRET_KEY
from seafevents.seafevent_server.task_manager import task_manager
from seafevents.seafevent_server.export_task_manager import event_export_task_manager

from seafevents.semantic_search.index_task.index_task_manager import index_task_manager
from seafevents.repo_data import repo_data

app = Flask(__name__)
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -96,3 +99,187 @@ def query_status():
return make_response((error, 500))
return make_response(({'is_finished': is_finished}, 200))


@app.route('/library-sdoc-indexes', methods=['POST'])
def library_sdoc_indexes():
    """Queue a task that creates the sdoc index for one library.

    The request body must be a JSON object containing ``repo_id``.

    Responses:
        200 ``{'task_id': ...}``: an index task is already pending/running
            for the repo, or a new one was queued.
        400: the body is not a JSON object, ``repo_id`` is missing, the repo
            has no head commit, or an index already exists.
        403: the auth token check failed.
        500: one of the index manager calls raised.
    """
    if not check_auth_token(request):
        return {'error_msg': 'Permission denied'}, 403

    try:
        data = json.loads(request.data)
    except Exception as e:
        logger.exception(e)
        return {'error_msg': 'Bad request.'}, 400

    # Valid JSON that is not an object (list, string, number, null) has no
    # .get(); reject it here instead of failing with an AttributeError.
    if not isinstance(data, dict):
        return {'error_msg': 'Bad request.'}, 400

    repo_id = data.get('repo_id')
    if not repo_id:
        return {'error_msg': 'repo_id invalid.'}, 400

    commit_id = repo_data.get_repo_head_commit(repo_id)
    if not commit_id:
        return {'error_msg': 'repo invalid.'}, 400

    try:
        is_exist = index_task_manager.repo_file_index.check_index(repo_id)
    except Exception as e:
        logger.exception(e)
        return {'error_msg': 'Internal server error.'}, 500

    if is_exist:
        return {'error_msg': 'Index exists.'}, 400

    # Do not queue a duplicate: hand back the task that is already in flight.
    task = index_task_manager.get_pending_or_running_task(repo_id)
    if task:
        return {'task_id': task.id}, 200

    try:
        index_task_manager.index_manager.create_index_repo_db(repo_id)
        task_id = index_task_manager.add_library_sdoc_index_task(repo_id, commit_id)
    except Exception as e:
        logger.exception(e)
        return {'error_msg': 'Internal server error.'}, 500

    return {'task_id': task_id}, 200


@app.route('/search', methods=['POST'])
def search():
    """Run a keyword or hybrid search and return the results.

    The request body must be a JSON object with:
        query: non-blank string to search for.
        repos: non-empty list; the hybrid search only uses its first item.
        suffixes: optional, forwarded to the keyword search.
        search_filename_only: truthy selects the keyword search, otherwise
            the hybrid search is used.
        count: optional result limit; falls back to 20 when it is missing or
            cannot be converted to an int.

    Responses:
        200 ``{'results': ...}``
        400: malformed body, or invalid ``query`` / ``repos``.
        403: the auth token check failed.
        500: the search backend raised.
    """
    if not check_auth_token(request):
        return {'error_msg': 'Permission denied'}, 403

    try:
        data = json.loads(request.data)
    except Exception as e:
        logger.exception(e)
        return {'error_msg': 'Bad request.'}, 400

    # Valid JSON that is not an object has no .get(); treat it as malformed.
    if not isinstance(data, dict):
        return {'error_msg': 'Bad request.'}, 400

    # Validate before calling .strip(): a missing or non-string query used to
    # raise AttributeError and surface as an unhandled 500.
    query = data.get('query')
    if not isinstance(query, str) or not query.strip():
        return {'error_msg': 'query invalid.'}, 400
    query = query.strip()

    # repos[0] is read below, so a non-empty list is required.
    repos = data.get('repos')
    if not repos or not isinstance(repos, list):
        return {'error_msg': 'repos invalid.'}, 400

    suffixes = data.get('suffixes')
    search_filename_only = data.get('search_filename_only')

    try:
        count = int(data.get('count'))
    except (TypeError, ValueError, OverflowError):
        # Missing, non-numeric, or non-finite (json.loads accepts Infinity).
        count = 20

    try:
        if search_filename_only:
            results = index_task_manager.keyword_search(query, repos, count, suffixes)
        else:
            results = index_task_manager.hybrid_search(query, repos[0], count)
    except Exception as e:
        logger.exception(e)
        return {'error_msg': 'Internal server error.'}, 500

    return {'results': results}, 200


@app.route('/library-sdoc-index', methods=['PUT', 'DELETE'])
def library_sdoc_index():
    """Update (PUT) or delete (DELETE) the sdoc index of one library.

    The request body must be a JSON object containing ``repo_id``.

    DELETE responses:
        200 ``{'success': True}``: the index was deleted, or no index record
            exists for the repo.
        400: an index task for the repo is still pending or running.

    PUT responses:
        200 ``{'task_id': ...}``: id of the already pending/running task, or
            of the newly queued update task.
        400: the repo has no head commit.

    Both methods:
        400: the body is not a JSON object or ``repo_id`` is missing.
        403: the auth token check failed.
        500: one of the index manager calls raised.
    """
    if not check_auth_token(request):
        return {'error_msg': 'Permission denied'}, 403

    try:
        data = json.loads(request.data)
    except Exception as e:
        logger.exception(e)
        return {'error_msg': 'Bad request.'}, 400

    # Valid JSON that is not an object has no .get(); treat it as malformed.
    if not isinstance(data, dict):
        return {'error_msg': 'Bad request.'}, 400

    repo_id = data.get('repo_id')
    if not repo_id:
        return {'error_msg': 'repo_id invalid'}, 400

    try:
        index_repo = index_task_manager.index_manager.get_index_repo_by_repo_id(repo_id)
    except Exception as e:
        logger.exception(e)
        return {'error_msg': 'Internal server error.'}, 500

    if request.method == 'DELETE':
        # Nothing recorded for this repo: deleting is a no-op.
        if not index_repo:
            return {'success': True}, 200

        task = index_task_manager.get_pending_or_running_task(repo_id)
        if task:
            return {'error_msg': 'library sdoc index is running'}, 400

        try:
            index_task_manager.index_manager.delete_library_sdoc_index_by_repo_id(
                repo_id,
                index_task_manager.repo_file_index,
                index_task_manager.repo_status_index,
            )
        except Exception as e:
            logger.exception(e)
            return {'error_msg': 'Internal server error.'}, 500

        return {'success': True}, 200

    # The route only accepts PUT and DELETE, so this is the PUT path.
    commit_id = repo_data.get_repo_head_commit(repo_id)
    if not commit_id:
        return {'error_msg': 'repo invalid.'}, 400

    # Do not queue a duplicate: hand back the task that is already in flight.
    task = index_task_manager.get_pending_or_running_task(repo_id)
    if task:
        return {'task_id': task.id}, 200

    try:
        task_id = index_task_manager.add_update_a_library_sdoc_index_task(repo_id, commit_id)
    except Exception as e:
        logger.exception(e)
        return {'error_msg': 'Internal server error.'}, 500

    return {'task_id': task_id}, 200


@app.route('/task-status', methods=['GET'])
def query_task_status():
    """Report whether the index task named by ``task_id`` has finished.

    Reads ``task_id`` from the query string and answers with
    ``{'is_finished': <bool>}``. A failed auth token check yields 403, a
    missing ``task_id`` 400, and an unknown task 404.
    """
    if not check_auth_token(request):
        return {'error_msg': 'Permission denied'}, 403

    requested_id = request.args.get('task_id')
    if not requested_id:
        return {'error_msg': 'task_id invalid'}, 400

    found = index_task_manager.query_task(requested_id)
    if found:
        return {'is_finished': found.is_finished()}

    return {'error_msg': 'Task not found'}, 404


@app.route('/library-index-state', methods=['GET'])
def query_library_index_state():
    """Report the index state of the library given by ``repo_id``.

    Reads ``repo_id`` from the query string. The response body is
    ``{'state': ..., 'task_id': ...}`` where ``state`` is:
        'uncreated': no index record exists for the repo.
        'running': an index task is pending or running; ``task_id`` is its id.
        'finished': an index record exists and no task is in flight.
    ``task_id`` is an empty string unless the state is 'running'.

    A failed auth token check yields 403, a missing ``repo_id`` 400, and a
    failing index record lookup 500.
    """
    if not check_auth_token(request):
        return {'error_msg': 'Permission denied'}, 403

    repo_id = request.args.get('repo_id')
    if not repo_id:
        return {'error_msg': 'repo_id invalid'}, 400

    try:
        is_exist = index_task_manager.index_manager.get_index_repo_by_repo_id(repo_id)
    except Exception as e:
        logger.exception(e)
        return {'error_msg': 'Internal server error.'}, 500

    if not is_exist:
        return {'state': 'uncreated', 'task_id': ''}

    task = index_task_manager.get_pending_or_running_task(repo_id)
    if task:
        return {'state': 'running', 'task_id': task.id}

    return {'state': 'finished', 'task_id': ''}
8 changes: 8 additions & 0 deletions seafevent_server/seafevent_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from seafevents.seafevent_server.request_handler import app as application
from seafevents.seafevent_server.task_manager import task_manager
from seafevents.seafevent_server.export_task_manager import event_export_task_manager
from seafevents.semantic_search.index_task.index_task_manager import index_task_manager
from seafevents.app.config import ENABLE_SEAFILE_AI


class SeafEventServer(Thread):
Expand All @@ -17,6 +19,12 @@ def __init__(self, app, config):

task_manager.run()
event_export_task_manager.run()

if ENABLE_SEAFILE_AI:
# semantic search index task
index_task_manager.init(config)
index_task_manager.start()

self._server = WSGIServer((self._host, int(self._port)), application)

def _parse_config(self, config):
Expand Down
16 changes: 16 additions & 0 deletions semantic_search/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import os
import logging

from seafevents.app.config import get_config
logger = logging.getLogger(__name__)

# Bind both settings up front so that importing them from this module never
# fails, even when the environment variable or the config section is missing.
MODEL_VOCAB_PATH = ''
SUPPORT_INDEX_FILE_TYPES = []

try:
    evtconf = os.environ['EVENTS_CONFIG_FILE']
    conf = get_config(evtconf)
    sem_conf = conf['SEMANTIC_SEARCH']

    MODEL_VOCAB_PATH = sem_conf['embedding_model_vocab_path']
    # NOTE(review): the option name is spelled with three p's; it is kept
    # as-is so existing config files keep working -- confirm before renaming.
    # Split on bare commas and strip each entry so "a,b" and "a , b" parse
    # the same way as "a, b".
    SUPPORT_INDEX_FILE_TYPES = [
        file_type.strip()
        for file_type in sem_conf['suppport_index_file_types'].split(',')
        if file_type.strip()
    ]

except Exception as e:
    # Best effort: keep whatever was assigned so far and log the reason.
    logger.warning(e)
Loading
Loading