Merge branch 'Feature/#959' into Feature/#1036
vkehfdl1 committed Dec 8, 2024
2 parents (d4ab9ca + 9f27257), commit bb8b0d7
Showing 4 changed files with 49 additions and 5 deletions.
api/app.py: 37 additions & 0 deletions

@@ -421,6 +421,41 @@ async def get_parse_documents(project_id):
     return jsonify(result_dict_list), 200
 
 
+@app.route("/projects/<project_id>/parse/<parsed_name>", methods=["GET"])
+@project_exists(WORK_DIR)
+async def get_parsed_file(project_id: str, parsed_name: str):
+    parsed_folder = os.path.join(WORK_DIR, project_id, "parse")
+    raw_df = pd.read_parquet(
+        os.path.join(parsed_folder, parsed_name, "0.parquet"), engine="pyarrow"
+    )
+    requested_filename = request.args.get("filename", type=str)
+    requested_page = request.args.get("page", -1, type=int)
+
+    if requested_filename is None:
+        return jsonify({"error": "Filename is required"}), 400
+
+    if requested_page < -1:
+        return jsonify({"error": "Invalid page number"}), 400
+
+    requested_filepath = os.path.join(
+        WORK_DIR, project_id, "raw_data", requested_filename
+    )
+
+    raw_row = raw_df.loc[raw_df["path"] == requested_filepath].loc[
+        raw_df["page"] == requested_page
+    ]
+    if len(raw_row) <= 0:
+        raw_row = raw_df.loc[raw_df["path"] == requested_filepath].loc[
+            raw_df["page"] == -1
+        ]
+    if len(raw_row) <= 0:
+        return jsonify({"error": "No matching document found"}), 404
+
+    result_dict = raw_row.iloc[0].to_dict()
+
+    return jsonify(result_dict), 200
+
+
 @app.route("/projects/<project_id>/chunk", methods=["GET"])
 @project_exists(WORK_DIR)
 async def get_chunk_documents(project_id):
@@ -465,6 +500,7 @@ async def parse_documents_endpoint(project_id):
config = data["config"]
target_extension = data["extension"]
parse_name = data["name"]
all_files: bool = data.get("all_files", True)

parse_dir = os.path.join(WORK_DIR, project_id, "parse")

@@ -476,6 +512,7 @@
         config_str=yaml.dump(config),
         parse_name=parse_name,
         glob_path=f"*.{target_extension}",
+        all_files=all_files,
     )
     task_id = task.id
     return jsonify({"task_id": task_id, "status": "started"})
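For reference, a minimal sketch of how a client might call the new endpoint, assuming the API is served at http://localhost:5000 and that a project "my_project" has a parse run named "my_parse" (host, port, and both names are illustrative, not part of this commit):

import requests

# Fetch the parsed row for one source file. "page" is optional; the handler
# defaults it to -1 and also falls back to the page == -1 row when no row
# matches the requested page.
resp = requests.get(
    "http://localhost:5000/projects/my_project/parse/my_parse",
    params={"filename": "report.pdf", "page": 2},
)
if resp.ok:
    print(resp.json())  # one parquet row serialized as a dict
else:
    print(resp.status_code, resp.json()["error"])

Note that the handler joins "filename" onto the project's raw_data directory, so clients pass the bare file name rather than an absolute path.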
api/src/run.py: 2 additions & 2 deletions

@@ -13,13 +13,13 @@
 from src.schema import QACreationRequest
 
 
-def run_parser_start_parsing(data_path_glob, project_dir, yaml_path):
+def run_parser_start_parsing(data_path_glob, project_dir, yaml_path, all_files: bool):
     # Import Parser here if it's defined in another module
     parser = Parser(data_path_glob=data_path_glob, project_dir=project_dir)
     print(
         f"Parser started with data_path_glob: {data_path_glob}, project_dir: {project_dir}, yaml_path: {yaml_path}"
     )
-    parser.start_parsing(yaml_path, all_files=True)
+    parser.start_parsing(yaml_path, all_files=all_files)
     print("Parser completed")
 
 
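With the flag threaded through, the wrapper can also be exercised directly; a sketch with placeholder paths (in production the Celery task supplies these from the project layout under WORK_DIR):

from src.run import run_parser_start_parsing

run_parser_start_parsing(
    data_path_glob="/work/my_project/raw_data/*.pdf",  # placeholder path
    project_dir="/work/my_project/parse/my_parse",  # placeholder path
    yaml_path="/work/my_project/parse_config.yaml",  # placeholder path
    all_files=False,  # previously hard-coded to True inside this function
)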
api/tasks/trial_tasks.py: 9 additions & 2 deletions

@@ -203,7 +203,12 @@ def generate_qa_documents(self, project_id: str, request_data: Dict[str, Any]):

 @shared_task(bind=True, base=TrialTask)
 def parse_documents(
-    self, project_id: str, config_str: str, parse_name: str, glob_path: str = "*.*"
+    self,
+    project_id: str,
+    config_str: str,
+    parse_name: str,
+    glob_path: str = "*.*",
+    all_files: bool = True,
 ):
     load_dotenv(ENV_FILEPATH)
     try:
@@ -247,7 +252,9 @@ def parse_documents(
         with open(yaml_path, "w", encoding="utf-8") as f:
             yaml.safe_dump(config_dict, f, allow_unicode=True)
 
-        result = run_parser_start_parsing(raw_data_path, parsed_data_path, yaml_path)
+        result = run_parser_start_parsing(
+            raw_data_path, parsed_data_path, yaml_path, all_files
+        )
 
         self.update_state_and_db(
             trial_id="",
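End to end, the new flag travels from the request body through the Celery task into Parser.start_parsing. A sketch of starting a parse with all_files disabled, assuming the POST route mirrors the GET route above; the config body shown is illustrative only:

import requests

resp = requests.post(
    "http://localhost:5000/projects/my_project/parse",
    json={
        "config": {"modules": [{"module_type": "langchain_parse", "parse_method": "pdfminer"}]},  # illustrative config
        "extension": "pdf",
        "name": "my_parse",
        "all_files": False,  # new in this commit; omitting it defaults to True
    },
)
print(resp.json())  # expected: {"task_id": "...", "status": "started"}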
