Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion docSite/content/zh-cn/docs/development/upgrading/4823.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,5 @@ weight: 802
1. 标签过滤时,子文件夹未成功过滤。
2. 暂时移除 md 阅读优化,避免链接分割错误。
3. 离开团队时,未刷新成员列表。
4. PPTX 编码错误,导致解析失败。
4. PPTX 编码错误,导致解析失败。
5. 删除知识库单条数据时,全文索引未跟随删除。
74 changes: 23 additions & 51 deletions packages/service/core/dataset/collection/controller.ts
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,13 @@ export const delCollectionRelatedSource = async ({
collections,
session
}: {
collections: DatasetCollectionSchemaType[];
collections: {
teamId: string;
fileId?: string;
metadata?: {
relatedImgId?: string;
};
}[];
session: ClientSession;
}) => {
if (collections.length === 0) return;
Expand Down Expand Up @@ -259,11 +265,13 @@ export const delCollectionRelatedSource = async ({
export async function delCollection({
collections,
session,
delRelatedSource
delImg = true,
delFile = true
}: {
collections: DatasetCollectionSchemaType[];
session: ClientSession;
delRelatedSource: boolean;
delImg: boolean;
delFile: boolean;
}) {
if (collections.length === 0) return;

Expand All @@ -281,9 +289,18 @@ export async function delCollection({
collectionId: { $in: collectionIds }
});

/* file and imgs */
if (delRelatedSource) {
await delCollectionRelatedSource({ collections, session });
if (delImg) {
await delImgByRelatedId({
teamId,
relateIds: collections.map((item) => item?.metadata?.relatedImgId || '').filter(Boolean),
session
});
}
if (delFile) {
await delFileByFileIdList({
bucketName: BucketNameEnum.dataset,
fileIdList: collections.map((item) => item?.fileId || '').filter(Boolean)
});
}

// Delete dataset_datas
Expand All @@ -309,48 +326,3 @@ export async function delCollection({
// no session delete: delete files, vector data
await deleteDatasetDataVector({ teamId, datasetIds, collectionIds });
}

/**
 * Delete the given collections along with their training rows, data rows and
 * vector data. Nothing else is touched by this function.
 *
 * The team id is taken from the first collection and applied to every query.
 * Resolves without doing anything when `collections` is empty; rejects with a
 * string when the first collection has no team id.
 *
 * @param collections Collections to delete.
 * @param session Mongo session passed to the data and collection deletions.
 */
export async function delOnlyCollection({
  collections,
  session
}: {
  collections: DatasetCollectionSchemaType[];
  session: ClientSession;
}) {
  if (collections.length === 0) return;

  const { teamId } = collections[0];
  if (!teamId) return Promise.reject('teamId is not exist');

  // Collect ids in one pass; the Set keeps first-seen order of dataset ids.
  const collectionIds: string[] = [];
  const uniqueDatasetIds = new Set<string>();
  collections.forEach((item) => {
    collectionIds.push(String(item._id));
    uniqueDatasetIds.add(String(item.datasetId));
  });
  const datasetIds = Array.from(uniqueDatasetIds);

  // Fresh filter object per query so no call can observe another's changes.
  const relatedRowsFilter = () => ({
    teamId,
    datasetId: { $in: datasetIds },
    collectionId: { $in: collectionIds }
  });

  // Training rows: this call is not given the session.
  await MongoDatasetTraining.deleteMany(relatedRowsFilter());

  // Data rows, inside the session.
  await MongoDatasetData.deleteMany(relatedRowsFilter(), { session });

  // The collection documents themselves, inside the session.
  await MongoDatasetCollection.deleteMany({ teamId, _id: { $in: collectionIds } }, { session });

  // Vector data is removed without the session.
  await deleteDatasetDataVector({ teamId, datasetIds, collectionIds });
}
15 changes: 8 additions & 7 deletions packages/service/core/dataset/collection/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,14 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => {
}

await mongoSessionRun(async (session) => {
// Delete old collection
await delCollection({
collections: [collection],
delImg: false,
delFile: false,
session
});

// Create new collection
await createCollectionAndInsertData({
session,
Expand Down Expand Up @@ -208,13 +216,6 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => {
updateTime: new Date()
}
});

// Delete old collection
await delCollection({
collections: [collection],
delRelatedSource: false,
session
});
});

return DatasetCollectionSyncResultEnum.success;
Expand Down
4 changes: 2 additions & 2 deletions packages/service/core/dataset/data/dataTextSchema.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { connectionMongo, getMongoModel } from '../../../common/mongo';
const { Schema } = connectionMongo;
import { DatasetDataSchemaType } from '@fastgpt/global/core/dataset/type.d';
import { DatasetDataTextSchemaType } from '@fastgpt/global/core/dataset/type.d';
import { TeamCollectionName } from '@fastgpt/global/support/user/team/constant';
import { DatasetCollectionName } from '../schema';
import { DatasetColCollectionName } from '../collection/schema';
Expand Down Expand Up @@ -45,7 +45,7 @@ try {
console.log(error);
}

export const MongoDatasetDataText = getMongoModel<DatasetDataSchemaType>(
export const MongoDatasetDataText = getMongoModel<DatasetDataTextSchemaType>(
DatasetDataTextCollectionName,
DatasetDataTextSchema
);
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,6 @@ const ModelTest = ({ models, onClose }: { models: string[]; onClose: () => void
}
);

console.log(testModelList);
return (
<MyModal
iconSrc={'core/chat/sendLight'}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ const ChannelLog = ({ Tab }: { Tab: React.ReactNode }) => {
},
...res
];
}, [systemModelList]);
}, [systemModelList, t]);

const { data, isLoading, ScrollData } = useScrollPagination(getChannelLog, {
pageSize: 20,
Expand Down
2 changes: 1 addition & 1 deletion projects/app/src/pages/api/admin/clearInvalidData.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ async function checkInvalidImg(start: Date, end: Date, limit = 50) {
'metadata.relatedImgId': image.metadata?.relatedId
},
'_id'
);
).lean();

if (!collection) {
await image.deleteOne();
Expand Down
206 changes: 206 additions & 0 deletions projects/app/src/pages/api/admin/initv4823.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { authCert } from '@fastgpt/service/support/permission/auth/common';
import { addHours } from 'date-fns';
import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';
import { MongoDataset } from '@fastgpt/service/core/dataset/schema';
import { delay, retryFn } from '@fastgpt/global/common/system/utils';
import { delCollection } from '@fastgpt/service/core/dataset/collection/controller';
import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTextSchema';
import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type';
import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema';
import { deleteDatasetDataVector } from '@fastgpt/service/common/vectorStore/controller';

/**
 * Clean up collections whose dataset was deleted while the collections
 * themselves were left behind.
 *
 * Collections are read in `_id` order using a cursor (`_id > lastId`) rather
 * than `skip`: this function deletes rows from the collection it is paging
 * over, and with `skip` every deletion shifts the window so that later rows
 * are never inspected.
 *
 * A failed batch is retried after one second. After `maxConsecutiveFailures`
 * failures in a row the last error is rethrown instead of looping forever.
 */
const checkInvalidCollection = async () => {
  const batchSize = 1000;
  const maxConsecutiveFailures = 10;

  let lastId: DatasetCollectionSchemaType['_id'] | undefined;
  let checked = 0;
  let consecutiveFailures = 0;

  while (true) {
    try {
      const collections = await MongoDatasetCollection.find(
        lastId ? { _id: { $gt: lastId } } : {},
        '_id teamId datasetId fileId metadata'
      )
        .sort({ _id: 1 })
        .limit(batchSize)
        .lean();
      if (collections.length === 0) break;

      // Group the batch by datasetId so each dataset is looked up only once.
      const datasetMap: Record<string, DatasetCollectionSchemaType[]> = {};
      for (const collection of collections) {
        const datasetId = String(collection.datasetId);
        const group = datasetMap[datasetId];
        if (group) {
          group.push(collection);
        } else {
          datasetMap[datasetId] = [collection];
        }
      }

      for (const datasetId of Object.keys(datasetMap)) {
        const group = datasetMap[datasetId];
        if (!group) continue;

        // A failure for one dataset is logged and must not stop the others.
        try {
          await retryFn(async () => {
            const datasetExists = await MongoDataset.findById(datasetId, '_id').lean();
            if (!datasetExists) {
              console.log('清理无效的知识库集合, datasetId', datasetId);
              await mongoSessionRun(async (session) => {
                return await delCollection({
                  collections: group,
                  delImg: true,
                  delFile: true,
                  session
                });
              });
            }
          });
        } catch (error) {
          console.log(error);
        }
      }

      // Advance the cursor past this batch and count what was actually read.
      lastId = collections[collections.length - 1]._id;
      checked += collections.length;
      consecutiveFailures = 0;
      console.log(`检测集合完成:${checked}`);
    } catch (error) {
      console.log(error);
      consecutiveFailures += 1;
      if (consecutiveFailures >= maxConsecutiveFailures) {
        throw error;
      }
      await delay(1000);
    }
  }
};

/**
 * Clean up data whose collection was deleted while the data rows were left
 * behind.
 *
 * Groups all data rows by collectionId, loads the ids of every existing
 * collection, and for each group whose collection is missing deletes its
 * training rows, data_text rows, vector data and finally the data rows.
 * Failures are logged; nothing is thrown to the caller.
 */
const checkInvalidData = async () => {
  try {
    const groups = (await MongoDatasetData.aggregate([
      {
        $group: {
          _id: '$collectionId',
          teamId: { $first: '$teamId' },
          datasetId: { $first: '$datasetId' },
          collectionId: { $first: '$collectionId' }
        }
      }
    ])) as {
      _id: string;
      teamId: string;
      datasetId: string;
      collectionId: string;
    }[];
    console.log('Total data collections length', groups.length);

    // Load the ids of all existing collections in one query.
    const collections = await MongoDatasetCollection.find({}, '_id').lean();
    console.log('Total collection length', collections.length);
    const existingCollectionIds = new Set(collections.map((item) => String(item._id)));

    // Remove the content of every group whose collection no longer exists.
    for (const { teamId, datasetId, collectionId } of groups) {
      if (existingCollectionIds.has(String(collectionId))) continue;

      try {
        console.log('清理无效的知识库集合内容, collectionId', collectionId);
        await retryFn(async () => {
          await MongoDatasetTraining.deleteMany({ teamId, datasetId, collectionId });
          await MongoDatasetDataText.deleteMany({ teamId, datasetId, collectionId });
          await deleteDatasetDataVector({
            teamId,
            datasetIds: [datasetId],
            collectionIds: [collectionId]
          });
          // The data rows go last, after everything that hangs off them.
          await MongoDatasetData.deleteMany({ teamId, datasetId, collectionId });
        });
      } catch (error) {
        console.log(error);
      }
    }

    console.log(`检测集合完成`);
  } catch (error) {
    console.log(error);
  }
};

/**
 * Clean up data_text (full-text index) rows whose data row was deleted.
 *
 * Reads every `dataId` from data_text and every `_id` from data, then deletes
 * the data_text rows whose `dataId` has no matching data row. Deletion runs
 * in batches so a large orphan list is never sent as one `$in` filter.
 * Failures are logged; nothing is thrown to the caller.
 */
const checkInvalidDataText = async () => {
  const deleteBatchSize = 1000;

  try {
    // All dataIds referenced by the index layer.
    const dataTexts = await MongoDatasetDataText.find({}, 'dataId').lean();
    const dataIds = dataTexts.map((item) => String(item.dataId));
    console.log('Total data_text dataIds:', dataIds.length);

    // All ids that exist in the data layer.
    const datas = await MongoDatasetData.find({}, '_id').lean();
    const datasSet = new Set(datas.map((item) => String(item._id)));
    console.log('Total data length:', datas.length);

    // Present in the index layer but absent from the data layer: already deleted.
    const unExistsIds = dataIds.filter((id) => !datasSet.has(id));
    console.log('Total unExists dataIds:', unExistsIds.length);

    for (let offset = 0; offset < unExistsIds.length; offset += deleteBatchSize) {
      await MongoDatasetDataText.deleteMany({
        dataId: { $in: unExistsIds.slice(offset, offset + deleteBatchSize) }
      });
    }
  } catch (error) {
    // Log instead of swallowing, so a failed cleanup is visible.
    console.log(error);
  }
};

/**
 * Admin endpoint that starts the dirty-data cleanup for this upgrade.
 *
 * After connecting to the database and passing the root-only `authCert`
 * check, it starts three cleanup passes in the background (invalid
 * collections, invalid data, invalid data_text) and responds with `success`
 * immediately, without waiting for them to finish. Cleanup errors are only
 * logged. Connection or auth failures produce a 500 response.
 */
export default async function handler(req: NextApiRequest, res: NextApiResponse) {
  try {
    await connectToDatabase();
    await authCert({ req, authRoot: true });

    // Fire-and-forget: the passes run one after another, after the response.
    void (async () => {
      try {
        console.log('清理无效的集合');
        await checkInvalidCollection();
        console.log('清理无效的数据');
        await checkInvalidData();
        console.log('清理无效的data_text');
        await checkInvalidDataText();
      } catch (error) {
        // Include the error so the failure can be diagnosed from the logs.
        console.log('执行脏数据清理任务出错了', error);
      }
    })();

    jsonRes(res, {
      message: 'success'
    });
  } catch (error) {
    console.log(error);

    jsonRes(res, {
      code: 500,
      error
    });
  }
}
Loading
Loading