Skip to content

Commit

Permalink
Fix
Browse files Browse the repository at this point in the history
  • Loading branch information
grzegorzZ1 committed Nov 14, 2023
1 parent f33cc00 commit 5a8020f
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 8 deletions.
6 changes: 5 additions & 1 deletion csis_scraper/scrape/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -641,7 +641,11 @@ def traverse_papers(self, page_url, volume_no):
new_title = str(title).replace(" ", "_")
source_file = os.path.join(self.pdf_directory, pdf_id) + ".pdf"
target_file = os.path.join(self.pdf_directory, new_title) + ".pdf"
target_path_list = target_file.split("/")
if not re.search(r"volume_\d+", target_path_list[-2]):
target_path_list[-2] = target_path_list[-2] + "_" + target_path_list[-1]
target_path_list = target_path_list[:-1]
target_file = os.path.join(*target_path_list)
os.rename(source_file, target_file)

scraped_docs_list.append(scraped)
return scraped_docs_list
20 changes: 13 additions & 7 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import yaml
import shutil
import tqdm
import re


def tuple_type(strings):
Expand All @@ -34,6 +35,16 @@ def return_config(path):
return yaml.safe_load(file)


def prepare_ttl_path(ttl_dir, article):
    """Build the output ``.ttl`` file path for an article inside ``ttl_dir``.

    The article title is used as the file stem. Path separators occurring in
    the title are replaced with ``_`` so the title can never introduce extra
    directory levels, and spaces anywhere in the resulting path are replaced
    with ``_``.

    Args:
        ttl_dir: Directory the file should be placed in. May be relative or
            absolute; it is kept as given (apart from the space replacement).
        article: Object exposing a string ``title`` attribute.

    Returns:
        The path ``<ttl_dir>/<sanitised title>.ttl`` with spaces replaced by
        underscores.
    """
    # Neutralise separators in the title itself instead of splitting the
    # joined path: this handles any number of separators, keeps a leading "/"
    # on an absolute ttl_dir, and does not depend on how ttl_dir is named.
    separators = {"/", os.sep}
    if os.altsep:
        separators.add(os.altsep)

    safe_title = article.title
    for separator in separators:
        safe_title = safe_title.replace(separator, "_")

    ttl_filename = os.path.join(ttl_dir, safe_title) + ".ttl"
    # Spaces are replaced across the whole path (directory part included),
    # matching the long-standing naming of the generated files.
    return ttl_filename.replace(" ", "_")


def scrape_scpe(scpe_issues_to_scrape):
print("Processing SCPE archives...")
logging.basicConfig(
Expand Down Expand Up @@ -66,8 +77,7 @@ def scrape_scpe(scpe_issues_to_scrape):
if not os.path.exists(ttl_dir):
os.makedirs(ttl_dir)

ttl_filename = os.path.join(ttl_dir, os.path.basename(article.title + ".ttl"))
ttl_filename = ttl_filename.replace(" ", "_")
ttl_filename = prepare_ttl_path(ttl_dir, article)

with open(ttl_filename, "w") as file:
file.write(g)
Expand Down Expand Up @@ -100,11 +110,7 @@ def scrape_csis(csis_volumes_to_scrape, args):
ttl_dir = f"./output/ttls/csis/volume_{article.volume}"
if not os.path.exists(ttl_dir):
os.mkdir(ttl_dir)

ttl_filename = os.path.join(
ttl_dir, os.path.basename(article.title + ".ttl")
)
ttl_filename = ttl_filename.replace(" ", "_")
ttl_filename = prepare_ttl_path(ttl_dir, article)

with open(ttl_filename, "w") as file:
file.write(g)
Expand Down

0 comments on commit 5a8020f

Please sign in to comment.