nyaa_scraper.py
import json
import re
import sys

import requests
from bs4 import BeautifulSoup


class Constants:
    NyaaBaseUrl = "https://nyaa.si"
    DefaultProfilePic = "https://i.imgur.com/TKkz0qM.png"


def sanitize_filename(name):
    """Remove disallowed characters from the filename."""
    # Allowed characters: alphanumerics, a few punctuation symbols, and
    # CJK/kana ranges so Japanese titles survive sanitization.
    allowed_chars = r'[^a-zA-Z0-9\(\)\[\]\.\,\-\_一-龥ぁ-ゔァ-ヴー々〆〤\u4E00-\u9FFF]'
    # Replace disallowed characters with underscores.
    sanitized_name = re.sub(allowed_chars, '_', name)
    # Limit the filename length to avoid OS-specific issues.
    return sanitized_name[:255]


def fetch_torrent_info(torrent_id):
    """Fetch a torrent's view page and dump its metadata to a JSON file."""
    url = f"{Constants.NyaaBaseUrl}/view/{torrent_id}"
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        print(f"Skipped {torrent_id} - HTTP {response.status_code}")
        return False

    soup = BeautifulSoup(response.text, 'html.parser')
    panel = soup.find('div', class_='panel panel-default')
    if not panel:
        print(f"Skipped {torrent_id} - No panel found.")
        return False

    # The details panel lays out its fields in Bootstrap rows/columns.
    rows = panel.find_all('div', class_='row')
    try:
        title = panel.find('h3', class_='panel-title').text.strip() if panel.find('h3', class_='panel-title') else "Unknown Title"
        torrent_data = {
            "id": int(torrent_id),
            "title": title,
            "file": (Constants.NyaaBaseUrl + panel.find('a', href=lambda x: x and x.endswith('.torrent'))['href']) if panel.find('a', href=lambda x: x and x.endswith('.torrent')) else "Unknown File URL",
            "magnet": panel.find('a', href=lambda x: x and x.startswith('magnet:'))['href'] if panel.find('a', href=lambda x: x and x.startswith('magnet:')) else "Unknown Magnet URL",
            "size": rows[3].find_all('div', class_='col-md-5')[0].text.strip() if len(rows[3].find_all('div', class_='col-md-5')) > 0 else "Unknown Size",
            "category": rows[0].find_all('div', class_='col-md-5')[0].text.strip() if len(rows[0].find_all('div', class_='col-md-5')) > 0 else "Unknown Category",
            "uploaded": rows[0].find_all('div', class_='col-md-5')[1].text.strip() if len(rows[0].find_all('div', class_='col-md-5')) > 1 else "Unknown Upload Date",
            "seeders": int(rows[1].find_all('div', class_='col-md-5')[1].text.strip()) if len(rows[1].find_all('div', class_='col-md-5')) > 1 else 0,
            "leechers": int(rows[2].find_all('div', class_='col-md-5')[1].text.strip()) if len(rows[2].find_all('div', class_='col-md-5')) > 1 else 0,
            "completed": int(rows[3].find_all('div', class_='col-md-5')[1].text.strip()) if len(rows[3].find_all('div', class_='col-md-5')) > 1 else 0,
            "info_hash": rows[4].find('div', class_='col-md-5').text.strip() if rows[4].find('div', class_='col-md-5') else "Unknown Info Hash"
        }
    except (IndexError, ValueError) as e:
        print(f"Skipped {torrent_id} - Error parsing data: {e}")
        return False

    description = soup.find('div', id='torrent-description').text.strip() if soup.find('div', id='torrent-description') else "No description available"

    # The comment count comes from the "Comments - N" panel heading.
    comments_section = soup.find('div', id='comments')
    if comments_section:
        try:
            comment_count = int(comments_section.find('h3', class_='panel-title').text.split('-')[-1].strip())
        except (AttributeError, ValueError):
            comment_count = 0
    else:
        comment_count = 0

    comments = []
    if comment_count > 0:
        comment_panels = comments_section.find_all('div', class_='comment-panel')
        for comment_panel in comment_panels:
            body = comment_panel.find('div', class_='panel-body')
            if body:
                comment = {
                    "name": body.find('a').text.strip() if body.find('a') else "Unknown User",
                    "content": body.find('div', class_='comment-body').find('div', class_='comment-content').text.strip() if body.find('div', class_='comment-body') and body.find('div', class_='comment-body').find('div', class_='comment-content') else "No content"
                }
                comments.append(comment)

    file_info = {
        "torrent": torrent_data,
        "description": description,
        "commentInfo": {
            "count": comment_count,
            "comments": comments
        }
    }

    # Sanitize the title for a valid filename.
    filename = sanitize_filename(title) + ".json"
    with open(filename, "w", encoding='utf-8') as f:
        json.dump(file_info, f, ensure_ascii=False, indent=4)

    print(f"Fetched {torrent_id}")
    return True


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python nyaa_scraper.py <id_start-id_end>")
        sys.exit(1)

    id_range = sys.argv[1]
    try:
        start_id, end_id = map(int, id_range.split('-'))
    except ValueError:
        print("Invalid range format. Use <id_start-id_end>")
        sys.exit(1)

    for torrent_id in range(start_id, end_id + 1):
        fetch_torrent_info(torrent_id)
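
# Example invocation (illustrative; the ID range below is arbitrary, any
# valid <id_start-id_end> pair works):
#
#     python nyaa_scraper.py 1750000-1750010
#
# This fetches view pages 1750000 through 1750010 sequentially and writes one
# "<sanitized title>.json" file per successfully parsed torrent into the
# current working directory; unreachable or unparsable pages are skipped with
# a message on stdout.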