Skip to content

Commit

Permalink
feat: integrate-seafile-ai-search
Browse files Browse the repository at this point in the history
  • Loading branch information
cir9no committed Jul 7, 2024
1 parent fe35425 commit 9caa367
Show file tree
Hide file tree
Showing 28 changed files with 3,194 additions and 3 deletions.
8 changes: 6 additions & 2 deletions app/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@
VirusScanner, Statistics, CountUserActivity, CountTrafficInfo, ContentScanner,\
WorkWinxinNoticeSender, FileUpdatesSender, RepoOldFileAutoDelScanner,\
DeletedFilesCountCleaner
from seafevents.semantic_search.semantic_search import sem_app

from seafevents.repo_metadata.index_master import RepoMetadataIndexMaster
from seafevents.repo_metadata.index_worker import RepoMetadataIndexWorker
from seafevents.seafevent_server.seafevent_server import SeafEventServer
from seafevents.app.config import ENABLE_METADATA_MANAGEMENT
from seafevents.app.config import ENABLE_METADATA_MANAGEMENT, ENABLE_SEAFILE_AI


class App(object):
Expand All @@ -25,7 +26,10 @@ def __init__(self, config, ccnet_config, seafile_config,
self._seafevent_server = SeafEventServer(self, config)

if self._bg_tasks_enabled:
self._index_updater = IndexUpdater(config)
if ENABLE_SEAFILE_AI:
self._index_updater = sem_app
else:
self._index_updater = IndexUpdater(config)
self._seahub_email_sender = SeahubEmailSender(config)
self._ldap_syncer = LdapSyncer(config, ccnet_config)
self._virus_scanner = VirusScanner(config, seafile_config)
Expand Down
1 change: 1 addition & 0 deletions app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
METADATA_SERVER_SECRET_KEY = getattr(seahub_settings, 'METADATA_SERVER_SECRET_KEY', '')
METADATA_SERVER_URL = getattr(seahub_settings, 'METADATA_SERVER_URL', '')
ENABLE_METADATA_MANAGEMENT = getattr(seahub_settings, 'ENABLE_METADATA_MANAGEMENT', False)
ENABLE_SEAFILE_AI = getattr(seahub_settings, 'ENABLE_SEAFILE_AI', False)
except ImportError:
logger.critical("Can not import seahub settings.")
raise RuntimeError("Can not import seahub settings.")
Expand Down
9 changes: 9 additions & 0 deletions mysql.sql
Original file line number Diff line number Diff line change
Expand Up @@ -213,3 +213,12 @@ CREATE TABLE IF NOT EXISTS `GroupIdLDAPUuidPair` (
UNIQUE KEY `group_id` (`group_id`),
UNIQUE KEY `group_uuid` (`group_uuid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

-- Tracks which libraries (repos) have a semantic-search index.
-- `updated` is NULL until the index has been refreshed at least once.
-- Formatting follows the other tables in this file (e.g. GroupIdLDAPUuidPair).
CREATE TABLE IF NOT EXISTS `index_repo` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `repo_id` varchar(36) NOT NULL,
  `created_at` datetime(6) NOT NULL,
  `updated` datetime(6),
  PRIMARY KEY (`id`) USING BTREE,
  UNIQUE KEY `repo_id` (`repo_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
2 changes: 1 addition & 1 deletion repo_data/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def create_engine_from_conf(config_file):
if seaf_conf.has_option('database', 'host'):
db_server = seaf_conf.get('database', 'host')
if seaf_conf.has_option('database', 'port'):
db_port =seaf_conf.getint('database', 'port')
db_port = seaf_conf.getint('database', 'port')
db_username = seaf_conf.get('database', 'user')
db_passwd = seaf_conf.get('database', 'password')
db_name = seaf_conf.get('database', 'db_name')
Expand Down
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,6 @@ pymysql
gevent==24.2.*
Flask==2.2.*
apscheduler
unstructured[docx,pptx]
transformers
ndjson
188 changes: 188 additions & 0 deletions seafevent_server/request_handler.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import jwt
import logging
import json

from flask import Flask, request, make_response
from seafevents.app.config import SEAHUB_SECRET_KEY
from seafevents.seafevent_server.task_manager import task_manager
from seafevents.semantic_search.sem_index_task.index_task_manager import index_task_manager
from seafevents.semantic_search.semantic_search import sem_app

app = Flask(__name__)
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -48,3 +51,188 @@ def add_init_metadata_task():
return make_response((e, 500))

return make_response(({'task_id': task_id}, 200))


@app.route('/library-sdoc-indexes', methods=['POST'])
def library_sdoc_indexes():
    """Create a semantic-search index for a library.

    Expects a JSON body with ``repo_id``. Returns the id of the queued
    indexing task; if a task for the repo is already pending or running,
    its id is returned instead of queueing a duplicate.
    """
    is_valid = check_auth_token(request)
    if not is_valid:
        return {'error_msg': 'Permission denied'}, 403

    try:
        data = json.loads(request.data)
    except Exception as e:
        logger.exception(e)
        return {'error_msg': 'Bad request.'}, 400

    repo_id = data.get('repo_id')
    if not repo_id:
        return {'error_msg': 'repo_id invalid.'}, 400

    # A repo without a head commit does not exist (or has no content yet).
    commit_id = sem_app.repo_data.get_repo_head_commit(repo_id)
    if not commit_id:
        return {'error_msg': 'repo invalid.'}, 400

    try:
        is_exist = sem_app.repo_file_index.check_index(repo_id)
    except Exception as e:
        logger.exception(e)
        # fixed typo: was "Internet server error."
        return {'error_msg': 'Internal server error.'}, 500

    if is_exist:
        return {'error_msg': 'Index exists.'}, 400

    # Reuse an in-flight task instead of starting a second one.
    task = index_task_manager.get_pending_or_running_task(repo_id)
    if task:
        return {'task_id': task.id}, 200

    try:
        sem_app.index_manager.create_index_repo_db(repo_id)
    except Exception as e:
        logger.exception(e)
        return {'error_msg': 'Internal server error.'}, 500

    task_id = index_task_manager.add_library_sdoc_index_task(repo_id, commit_id)

    return {'task_id': task_id}, 200


@app.route('/search', methods=['POST'])
def search():
    """Search libraries, either semantically or by filename keywords.

    JSON body: ``query`` (required), ``repos`` (required, non-empty list),
    optional ``suffixes``, ``search_filename_only`` and ``count``
    (defaults to 20 when absent or not an integer).
    """
    is_valid = check_auth_token(request)
    if not is_valid:
        return {'error_msg': 'Permission denied'}, 403

    try:
        data = json.loads(request.data)
    except Exception as e:
        logger.exception(e)
        return {'error_msg': 'Bad request.'}, 400

    # Guard the missing-key case: the original called .strip() directly on
    # data.get('query'), which raised AttributeError (an unhandled 500)
    # when 'query' was absent instead of the intended 400 below.
    query = (data.get('query') or '').strip()
    repos = data.get('repos')
    suffixes = data.get('suffixes')
    search_filename_only = data.get('search_filename_only')

    if not query:
        return {'error_msg': 'query invalid.'}, 400

    if not repos:
        return {'error_msg': 'repos invalid.'}, 400

    # Fall back to 20 results when 'count' is absent or not an integer.
    try:
        count = int(data.get('count'))
    except (TypeError, ValueError):
        count = 20

    if search_filename_only:
        results = index_task_manager.keyword_search(query, repos, count, suffixes)
    else:
        # Hybrid (semantic + keyword) search operates on a single repo.
        results = index_task_manager.hybrid_search(query, repos[0], count)

    return {'results': results}, 200


@app.route('/library-sdoc-index', methods=['PUT', 'DELETE'])
def library_sdoc_index():
    """Update (PUT) or delete (DELETE) a library's semantic-search index.

    Expects a JSON body with ``repo_id``. PUT queues an index-update task
    and returns its id; DELETE removes the index and is idempotent.
    """
    is_valid = check_auth_token(request)
    if not is_valid:
        return {'error_msg': 'Permission denied'}, 403

    try:
        data = json.loads(request.data)
    except Exception as e:
        logger.exception(e)
        return {'error_msg': 'Bad request.'}, 400

    repo_id = data.get('repo_id')
    if not repo_id:
        return {'error_msg': 'repo_id invalid'}, 400

    try:
        index_repo = sem_app.index_manager.get_index_repo_by_repo_id(repo_id)
    except Exception as e:
        logger.exception(e)
        # fixed typo: was "Internet server error."
        return {'error_msg': 'Internal server error.'}, 500

    if request.method == 'DELETE':
        # Nothing to delete: report success idempotently.
        if not index_repo:
            return {'success': True}, 200

        # Refuse to delete while an indexing task is still in flight.
        task = index_task_manager.get_pending_or_running_task(repo_id)
        if task:
            return {'error_msg': 'library sdoc index is running'}, 400

        try:
            sem_app.index_manager.delete_library_sdoc_index_by_repo_id(repo_id, sem_app.repo_file_index, sem_app.repo_status_index)
        except Exception as e:
            logger.exception(e)
            return {'error_msg': 'Internal server error.'}, 500

        return {'success': True}, 200

    elif request.method == 'PUT':
        commit_id = sem_app.repo_data.get_repo_head_commit(repo_id)
        if not commit_id:
            return {'error_msg': 'repo invalid.'}, 400

        # Reuse an in-flight task instead of queueing a duplicate.
        task = index_task_manager.get_pending_or_running_task(repo_id)
        if task:
            return {'task_id': task.id}, 200

        try:
            task_id = index_task_manager.add_update_a_library_sdoc_index_task(repo_id, commit_id)
        except Exception as e:
            logger.exception(e)
            return {'error_msg': 'Internal server error.'}, 500

        return {'task_id': task_id}, 200


@app.route('/task-status', methods=['GET'])
def query_task_status():
    """Report whether the indexing task named by ``task_id`` has finished."""
    if not check_auth_token(request):
        return {'error_msg': 'Permission denied'}, 403

    task_id = request.args.get('task_id')
    if not task_id:
        return {'error_msg': 'task_id invalid'}, 400

    task = index_task_manager.query_task(task_id)
    if not task:
        return {'error_msg': 'Task not found'}, 404

    return {'is_finished': task.is_finished()}


@app.route('/library-index-state', methods=['GET'])
def query_library_index_state():
    """Return a library's index state: 'uncreated', 'running' or 'finished'."""
    is_valid = check_auth_token(request)
    if not is_valid:
        return {'error_msg': 'Permission denied'}, 403

    repo_id = request.args.get('repo_id')
    if not repo_id:
        return {'error_msg': 'repo_id invalid'}, 400

    try:
        # Renamed from `is_exist`: this is the index-repo record, not a bool.
        index_repo = sem_app.index_manager.get_index_repo_by_repo_id(repo_id)
    except Exception as e:
        logger.exception(e)
        # fixed typo: was "Internet server error."
        return {'error_msg': 'Internal server error.'}, 500

    if not index_repo:
        return {'state': 'uncreated', 'task_id': ''}

    # An in-flight task means indexing is still running; otherwise finished.
    # (Replaces the fragile `cond and a or b` idiom with an explicit branch.)
    task = index_task_manager.get_pending_or_running_task(repo_id)
    if task:
        return {'state': 'running', 'task_id': task.id}
    return {'state': 'finished', 'task_id': ''}
96 changes: 96 additions & 0 deletions semantic_search/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import os
import sys
import logging

logger = logging.getLogger(__name__)


# --- Defaults; most values below may be overridden by seahub settings ---
# --- or by the optional settings files loaded at the bottom.          ---

FILE_SERVER = ''

APP_NAME = 'semantic-search'

SECRET_KEY = ''
SEAFILE_SERVER = ''

# log
LOG_FILE = None
LOG_LEVEL = 'info'
ENABLE_SYS_LOG = False

# sections
## indexManager worker count
INDEX_MANAGER_WORKERS = 2
INDEX_TASK_EXPIRE_TIME = 30 * 60  # seconds

RETRIEVAL_NUM = 20

# embedding dimension
DIMENSION = 768

MODEL_VOCAB_PATH = ''
FILE_SENTENCE_LIMIT = 1000

THRESHOLD = 0.01

## seasearch
SEASEARCH_SERVER = 'http://127.0.0.1:4080'
SEASEARCH_TOKEN = ''
VECTOR_M = 256
SHARD_NUM = 1

## sea-embedding
SEA_EMBEDDING_SERVER = ''
SEA_EMBEDDING_KEY = ''

EMBEDDING_API_TYPE = 'sea-embedding'

# seafile config database
MYSQL_HOST = ''
MYSQL_USER = ''
MYSQL_PASSWORD = ''
MYSQL_PORT = 3306
MYSQL_DB = 'seafile'
MYSQL_UNIX_SOCKET = ''

# repo file index support file types
SUPPORT_INDEX_FILE_TYPES = [
    '.sdoc',
    '.md',
    '.markdown',
    '.doc',
    '.docx',
    '.ppt',
    '.pptx',
    '.pdf',
]


CONF_DIR = '/opt/seafile/conf/'

# Seahub settings are mandatory: without them the service cannot run.
try:
    import seahub.settings as seahub_settings
    SEA_EMBEDDING_SERVER = getattr(seahub_settings, 'SEA_EMBEDDING_SERVER', '')
    SEA_EMBEDDING_KEY = getattr(seahub_settings, 'SEA_EMBEDDING_KEY', '')
    SEASEARCH_SERVER = getattr(seahub_settings, 'SEASEARCH_SERVER', '')
    SEASEARCH_TOKEN = getattr(seahub_settings, 'SEASEARCH_TOKEN', '')
    MODEL_VOCAB_PATH = getattr(seahub_settings, 'MODEL_VOCAB_PATH', '')
    MODEL_CACHE_DIR = getattr(seahub_settings, 'MODEL_CACHE_DIR', '')
    INDEX_STORAGE_PATH = getattr(seahub_settings, 'INDEX_STORAGE_PATH', '')
except ImportError:
    logger.critical("Can not import seahub settings.")
    raise RuntimeError("Can not import seahub settings.")


# Optional development overrides. Best-effort, but previously a bare
# `except: pass` swallowed every error (including syntax errors in the
# settings file); log the failure so misconfiguration stays visible.
try:
    if os.path.exists('/data/dev/seafevents/semantic_search/semantic_search_settings.py'):
        from seafevents.semantic_search.semantic_search_settings import *
except Exception as e:
    logger.warning('Failed to load semantic_search_settings: %s', e)

# Optional deployment overrides from the Seafile conf directory;
# the file is simply absent in most installations.
try:
    if os.path.exists(CONF_DIR):
        sys.path.insert(0, CONF_DIR)
        from seafile_ai_settings import *
except ImportError:
    pass
Loading

0 comments on commit 9caa367

Please sign in to comment.