Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Yan/20240925 modify request headers #432

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions .github/workflows/notify_on_pr.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Posts a WeCom (qyapi.weixin.qq.com) webhook message whenever a pull request
# is opened or reopened.
name: Notify on New PR

on:
  pull_request:
    types: [opened, reopened]

jobs:
  notify:
    runs-on: ubuntu-latest

    steps:
      - name: Send Notification
        # The webhook key is read from a repository secret instead of being
        # committed in plain text. The secret name below must be configured in
        # the repository settings, and the key that was previously committed
        # should be rotated.
        # NOTE(review): secrets are not exposed to pull_request runs from
        # forks; confirm whether fork PRs need to be notified.
        #
        # Untrusted PR fields (title, author, URL) are passed through the
        # environment rather than interpolated into the script text, so a
        # crafted PR title cannot inject shell commands or break the JSON.
        env:
          WEBHOOK_KEY: ${{ secrets.WECOM_WEBHOOK_KEY }}
          PR_TITLE: ${{ github.event.pull_request.title }}
          PR_AUTHOR: ${{ github.event.pull_request.user.login }}
          PR_URL: ${{ github.event.pull_request.html_url }}
        run: |
          # jq escapes the values, so quotes/newlines in the title stay valid JSON.
          payload=$(jq -n \
            --arg title "$PR_TITLE" \
            --arg author "$PR_AUTHOR" \
            --arg url "$PR_URL" \
            '{msgtype: "text", text: {content: "【PR已创建】\n\n标题:\($title)\n\n作者:\($author)\n\n链接:\($url)"}}')
          # The URL is quoted so the "?" is never treated as a shell glob.
          curl -sS -X POST "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=${WEBHOOK_KEY}" \
            -H "Content-Type: application/json" \
            -d "$payload"
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,10 @@ build
.DS_Store
.idea/
venv/

# Cookies
.linkedin_api

# Output
output
/*.json
144 changes: 144 additions & 0 deletions file_system/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@

import gzip
from os import makedirs
from os.path import isfile, exists
from file_system.downloader import download_file as static_download_file
from .finder import IGNORED_FILE_SET, get_file_paths, get_folder_paths, refine_path as static_refine_path
from .reader import DEFAULT_ENCODING, DEFAULT_EXCEL_SHEET_NAME, DEFAULT_VERBOSE, read_text, read_csv, read_excel, read_html, read_json
from .writer import DEFAULT_OVERWRITE, write_csv, write_excel, write_html, write_json, write_text


class FileSystem:
    """Facade over the package's reader, writer, finder and downloader helpers.

    An instance stores a default ``path``, ``verbose`` flag and ``encoding``;
    each method accepts per-call overrides and falls back to the stored value
    when the override is ``None``.
    """

    def __init__(self, path=None, verbose=DEFAULT_VERBOSE, encoding=DEFAULT_ENCODING):
        self.path = path
        self.verbose = verbose
        self.encoding = encoding

    def __get_path(self, path):
        """Return ``path`` or, when it is ``None``, the instance default.

        Raises:
            Exception: if the resolved path is empty or ``None``.
        """
        path = path if path is not None else self.path

        if not path:
            raise Exception('Path must be specified')

        return path

    def __prepare_folder(self, path):
        """Create the parent folder of ``path`` when it does not exist yet."""
        folder_path = '/'.join(path.split('/')[:-1])
        # A bare file name has no parent component; makedirs('') would raise,
        # so there is nothing to create in that case.
        if folder_path and not exists(folder_path):
            # exist_ok guards against another process creating the folder
            # between the exists() check and this call.
            makedirs(folder_path, exist_ok=True)

    def read(self, path=None, verbose=None, encoding=None, excel_sheet_name=DEFAULT_EXCEL_SHEET_NAME):
        """Read a file, picking the reader from the file extension.

        Supported extensions: ``.json``, ``.html``, ``.txt``, ``.log``,
        ``.csv`` and ``.xlsx``.

        Returns:
            Whatever the selected reader returns, or ``None`` when ``path`` is
            not an existing regular file.

        Raises:
            Exception: if no path is available or the extension is unsupported.
        """
        path = self.__get_path(path)
        verbose = verbose if verbose is not None else self.verbose
        encoding = encoding if encoding is not None else self.encoding

        if not isfile(path):
            return None

        if path.endswith('.json'):
            return read_json(file_path=path, verbose=verbose, encoding=encoding)
        if path.endswith('.html'):
            return read_html(file_path=path, verbose=verbose, encoding=encoding)
        if path.endswith(('.txt', '.log')):
            return read_text(file_path=path, verbose=verbose, encoding=encoding)
        if path.endswith('.csv'):
            return read_csv(file_path=path, verbose=verbose, encoding=encoding)
        if path.endswith('.xlsx'):
            return read_excel(file_path=path, sheet_name=excel_sheet_name, verbose=verbose)

        raise Exception(f"Unsupported file type for {path}")

    def write(self, data, path=None, verbose=None, csv_and_excel_overwrite=DEFAULT_OVERWRITE, encoding=None, csv_fieldnames=None, excel_sheet_name=DEFAULT_EXCEL_SHEET_NAME, excel_enable_str_conversion=False):
        """Write ``data`` to a file, picking the writer from the file extension.

        The parent folder is created first when missing. Supported extensions
        match :meth:`read`. ``csv_*`` / ``excel_*`` arguments are forwarded only
        to the CSV / Excel writers.

        Returns:
            Whatever the selected writer returns.

        Raises:
            Exception: if no path is available or the extension is unsupported.
        """
        path = self.__get_path(path)
        verbose = verbose if verbose is not None else self.verbose
        encoding = encoding if encoding is not None else self.encoding

        self.__prepare_folder(path)

        if path.endswith('.json'):
            return write_json(file_path=path, data=data, verbose=verbose, encoding=encoding)
        if path.endswith('.html'):
            return write_html(file_path=path, data=data, verbose=verbose, encoding=encoding)
        if path.endswith(('.txt', '.log')):
            return write_text(file_path=path, data=data, verbose=verbose, encoding=encoding)
        if path.endswith('.csv'):
            return write_csv(file_path=path, data=data, fieldnames=csv_fieldnames, verbose=verbose, overwrite=csv_and_excel_overwrite, encoding=encoding)
        if path.endswith('.xlsx'):
            return write_excel(file_path=path, data=data, sheet_name=excel_sheet_name, verbose=verbose, overwrite=csv_and_excel_overwrite, enable_str_conversion=excel_enable_str_conversion)

        raise Exception(f"Unsupported file type for {path}")

    def unzip(self, path=None, output_path=None, verbose=None):
        """Decompress a ``.gz`` file and return its bytes.

        When ``output_path`` is given, the decompressed bytes are also written
        there; its parent folder is created when missing.

        Raises:
            Exception: if no path is available or ``path`` does not end in
                ``.gz``.
        """
        path = self.__get_path(path)
        verbose = verbose if verbose is not None else self.verbose

        if not path.endswith('.gz'):
            raise Exception(f"Unsupported file type for {path}")

        with gzip.open(path, 'rb') as gz_file:
            data = gz_file.read()

        if output_path:
            # The folder that may need creating is the destination's, not the
            # source archive's (which must already exist to be read).
            self.__prepare_folder(output_path)
            with open(output_path, 'wb') as out_file:
                out_file.write(data)

            if verbose:
                print(
                    f"Unzipped data from {path} and wrote data to {output_path}")
        elif verbose:
            print(f"Unzipped data from {path} ")

        return data

    def download(self, url, file_path_without_extension=None, verbose=None):
        """Download ``url`` via the package downloader and return its result.

        ``file_path_without_extension`` falls back to the instance path; the
        parent folder is created when missing.
        """
        path = self.__get_path(file_path_without_extension)
        verbose = verbose if verbose is not None else self.verbose

        self.__prepare_folder(path)

        return static_download_file(url=url, file_path_without_extension=path, verbose=verbose)

    def find_files(self, path=None, ignored_file_set=IGNORED_FILE_SET, file_suffix=None, recursive=False):
        """List files under ``path``, optionally keeping only one suffix.

        ``file_suffix`` may be given with or without the leading dot.
        """
        path = path if path is not None else self.path
        file_paths = get_file_paths(path, ignored_file_set, recursive)
        if not file_suffix:
            return file_paths

        if not file_suffix.startswith('.'):
            file_suffix = '.' + file_suffix

        return [file_path for file_path in file_paths if file_path.endswith(file_suffix)]

    def find_folders(self, path=None, ignored_folder_set=None, recursive=False):
        """List folders under ``path``, skipping names in ``ignored_folder_set``.

        ``ignored_folder_set`` defaults to an empty set (created per call so no
        mutable object is shared between calls).
        """
        path = path if path is not None else self.path
        if ignored_folder_set is None:
            ignored_folder_set = set()
        return get_folder_paths(path, ignored_folder_set, recursive)

    def refine_path(self, path=None):
        """Normalize ``path`` (or the instance path) via the finder helper."""
        path = path if path is not None else self.path
        return static_refine_path(path)


def read_file(path=None, verbose=None, encoding=None, excel_sheet_name=DEFAULT_EXCEL_SHEET_NAME):
    """Read ``path`` through a fresh ``FileSystem`` and return the result."""
    file_system = FileSystem()
    return file_system.read(
        path=path,
        verbose=verbose,
        encoding=encoding,
        excel_sheet_name=excel_sheet_name,
    )


def write_file(data, path=None, verbose=None, csv_and_excel_overwrite=DEFAULT_OVERWRITE, encoding=None, csv_fieldnames=None, excel_sheet_name=DEFAULT_EXCEL_SHEET_NAME, excel_enable_str_conversion=False):
    """Write ``data`` to ``path`` through a fresh ``FileSystem``.

    All keyword arguments are forwarded unchanged to ``FileSystem.write``.
    """
    file_system = FileSystem()
    return file_system.write(
        data=data,
        path=path,
        verbose=verbose,
        csv_and_excel_overwrite=csv_and_excel_overwrite,
        encoding=encoding,
        csv_fieldnames=csv_fieldnames,
        excel_sheet_name=excel_sheet_name,
        excel_enable_str_conversion=excel_enable_str_conversion,
    )


def download_file(url, file_path_without_extension=None, verbose=None):
    """Download ``url`` through a fresh ``FileSystem`` and return the result."""
    file_system = FileSystem()
    return file_system.download(
        url=url,
        file_path_without_extension=file_path_without_extension,
        verbose=verbose,
    )


def unzip_file(path=None, output_path=None, verbose=None):
    """Decompress ``path`` through a fresh ``FileSystem`` and return the result."""
    file_system = FileSystem()
    return file_system.unzip(
        path=path,
        output_path=output_path,
        verbose=verbose,
    )


def find_files(path=None, ignored_file_set=IGNORED_FILE_SET, file_suffix=None, recursive=False):
    """List files under ``path`` through a fresh ``FileSystem``.

    All arguments are forwarded unchanged to ``FileSystem.find_files``.
    """
    file_system = FileSystem()
    return file_system.find_files(
        path=path,
        ignored_file_set=ignored_file_set,
        file_suffix=file_suffix,
        recursive=recursive,
    )


def find_folders(path=None, ignored_folder_set=None, recursive=False):
    """List folders under ``path`` through a fresh ``FileSystem``.

    Args:
        path: Folder to search; forwarded to ``FileSystem.find_folders``.
        ignored_folder_set: Folder names to skip. ``None`` (the default) means
            an empty set; a new set is created per call so no mutable default
            is shared between calls.
        recursive: Forwarded to ``FileSystem.find_folders``.
    """
    if ignored_folder_set is None:
        ignored_folder_set = set()
    return FileSystem().find_folders(path=path, ignored_folder_set=ignored_folder_set, recursive=recursive)
43 changes: 43 additions & 0 deletions file_system/downloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import mimetypes
import logging
import os
import requests

from file_system.utils import get_or_create_folder_path

# Default for download_file's ``verbose`` flag: progress logging is off unless
# the caller asks for it.
DEFAULT_VERBOSE = False


def download_file(url, file_path_without_extension, verbose=DEFAULT_VERBOSE, timeout=30):
    """Download ``url`` to disk, choosing the file extension from Content-Type.

    Args:
        url: Address to fetch with an HTTP GET.
        file_path_without_extension: Destination path; the guessed extension
            is appended to it. The parent folder is created when missing.
        verbose: When true, progress is reported through ``logging.info``.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            hang the caller forever. Pass ``None`` to wait indefinitely.

    Returns:
        The full path of the written file, including the extension.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    # The context manager releases the connection even when writing fails;
    # with stream=True the connection otherwise stays open until the body is
    # fully consumed or the response is closed.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()  # Ensure the request was successful

        # Try to extract the file extension from the Content-Type header.
        # Parameters such as "; charset=utf-8" are dropped first because
        # guess_extension only recognizes the bare media type.
        content_type = response.headers.get('Content-Type')
        media_type = content_type.split(';', 1)[0].strip() if content_type else ''
        extension = mimetypes.guess_extension(media_type) if media_type else None

        # If we couldn't find an extension, default to '.bin'
        if not extension:
            extension = '.bin'

        if verbose:
            logging.info(f"Guessed the file extension as {extension}")

        get_or_create_folder_path(file_path_without_extension)

        file_path = f"{file_path_without_extension}{extension}"

        if verbose:
            logging.info(f"Downloading file from {url}")

        # Stream the body to disk in chunks instead of loading it into memory
        with open(file_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)

    if verbose:
        logging.info(f"Downloaded file to {file_path}")

    return file_path
56 changes: 56 additions & 0 deletions file_system/finder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@

import pathlib
import platform
from os import listdir, path
from posixpath import join

# File names skipped by default when listing files.
IGNORED_FILE_SET = {'.DS_Store'}


def refine_path(path: str):
    """Return ``path`` as text after a round trip through ``pathlib``.

    On Windows the path is built as a ``pathlib.WindowsPath``; on every other
    platform it is built as a ``pathlib.PurePath``.
    """
    if platform.system() == 'Windows':
        normalized = pathlib.WindowsPath(path)
    else:
        normalized = pathlib.PurePath(path)
    return str(normalized)


def get_file_paths(folder_path: str, ignored_file_set=IGNORED_FILE_SET, recursive=False):
    """List the files found in ``folder_path``.

    Args:
        folder_path: Folder to scan. A path that does not exist yields ``[]``.
        ignored_file_set: File names (not full paths) to leave out.
        recursive: When true, subfolders are searched depth-first as well.

    Returns:
        A list of file paths, each prefixed with ``folder_path``.

    NOTE: the non-recursive branch joins with ``posixpath.join`` while the
    recursive branch joins with ``os.path.join``.
    """
    if not path.exists(folder_path):
        return []

    if recursive:
        def iter_files(current_dir):
            for entry in listdir(current_dir):
                entry_path = path.join(current_dir, entry)
                if path.isfile(entry_path) and entry not in ignored_file_set:
                    # Regular file that is not on the ignore list
                    yield entry_path
                elif path.isdir(entry_path):
                    # Descend into the subfolder before moving to the next entry
                    yield from iter_files(entry_path)

        return list(iter_files(folder_path))

    direct_files = []
    for entry in listdir(folder_path):
        if path.isfile(join(folder_path, entry)) and entry not in ignored_file_set:
            direct_files.append(join(folder_path, entry))
    return direct_files


def get_folder_paths(folder_path: str, ignored_folder_set=set(), recursive=False):
    """List the folders found in ``folder_path``.

    Args:
        folder_path: Folder to scan. A path that does not exist yields ``[]``.
        ignored_folder_set: Folder names (not full paths) to leave out. In
            recursive mode an ignored folder is not descended into either.
        recursive: When true, each folder is followed by its own subfolders
            (depth-first).

    Returns:
        A list of folder paths, each prefixed with ``folder_path``.

    NOTE: the non-recursive branch joins with ``posixpath.join`` while the
    recursive branch joins with ``os.path.join``.
    """
    if not path.exists(folder_path):
        return []

    if recursive:
        def iter_subfolders(current_dir):
            for entry in listdir(current_dir):
                entry_path = path.join(current_dir, entry)
                if not path.isdir(entry_path) or entry in ignored_folder_set:
                    continue
                # Report the folder itself, then everything beneath it
                yield entry_path
                yield from iter_subfolders(entry_path)

        return list(iter_subfolders(folder_path))

    direct_folders = []
    for entry in listdir(folder_path):
        if path.isdir(join(folder_path, entry)) and entry not in ignored_folder_set:
            direct_folders.append(join(folder_path, entry))
    return direct_folders
77 changes: 77 additions & 0 deletions file_system/reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import os
import csv
import json
import logging
import pandas as pd

# Text encoding the readers use when the caller does not pass one.
DEFAULT_ENCODING = 'utf-8'
# Readers log nothing unless verbose output is requested.
DEFAULT_VERBOSE = False
# Worksheet read by read_excel when no sheet name is given.
DEFAULT_EXCEL_SHEET_NAME = 'Sheet1'


def read_json(file_path: str, verbose=DEFAULT_VERBOSE, encoding=DEFAULT_ENCODING):
    """Parse the JSON file at ``file_path``.

    Returns:
        The decoded JSON value, or ``None`` when ``file_path`` is not an
        existing regular file.
    """
    # Nothing to read: report that with None rather than raising.
    if not os.path.isfile(file_path):
        return None

    with open(file_path, encoding=encoding) as json_file:
        parsed = json.load(json_file)

    if verbose:
        logging.info(f"Read JSON data from {file_path}")

    return parsed


def read_html(file_path: str, verbose=DEFAULT_VERBOSE, encoding=DEFAULT_ENCODING):
    """Return the full text of the HTML file at ``file_path``.

    Returns:
        The file content as one string, or ``None`` when ``file_path`` is not
        an existing regular file.
    """
    # Nothing to read: report that with None rather than raising.
    if not os.path.isfile(file_path):
        return None

    with open(file_path, encoding=encoding) as html_file:
        content = html_file.read()

    if verbose:
        logging.info(f"Read HTML data from {file_path}")

    return content


def read_text(file_path: str, verbose=DEFAULT_VERBOSE, encoding=DEFAULT_ENCODING):
    """Return the full text of the plain-text file at ``file_path``.

    Returns:
        The file content as one string, or ``None`` when ``file_path`` is not
        an existing regular file.
    """
    # Nothing to read: report that with None rather than raising.
    if not os.path.isfile(file_path):
        return None

    with open(file_path, encoding=encoding) as text_file:
        content = text_file.read()

    if verbose:
        logging.info(f"Read TXT data from {file_path}")

    return content


def read_csv(file_path: str, verbose=DEFAULT_VERBOSE, encoding=DEFAULT_ENCODING) -> list[dict] | None:
    """Load the CSV file at ``file_path`` as a list of row dictionaries.

    The file is parsed with ``csv.DictReader`` using ``,`` as delimiter and
    ``"`` as quote character, so the first row supplies the dictionary keys.

    Returns:
        One dict per data row, or ``None`` when ``file_path`` is not an
        existing regular file.
    """
    # Nothing to read: report that with None rather than raising.
    if not os.path.isfile(file_path):
        return None

    with open(file_path, newline='', encoding=encoding) as csv_file:
        rows = list(csv.DictReader(csv_file, delimiter=',', quotechar='"'))

    if verbose:
        logging.info(
            f"Read CSV data from {file_path} with {len(rows)} rows")

    return rows


def read_excel(file_path: str, sheet_name: str = DEFAULT_EXCEL_SHEET_NAME, verbose=DEFAULT_VERBOSE) -> list[dict] | None:
    """Load one worksheet of the Excel file at ``file_path`` as row dictionaries.

    The sheet is read with ``pandas.read_excel`` and converted with
    ``DataFrame.to_dict(orient='records')``.

    Returns:
        One dict per row, or ``None`` when ``file_path`` is not an existing
        regular file.
    """
    # Nothing to read: report that with None rather than raising.
    if not os.path.isfile(file_path):
        return None

    frame = pd.read_excel(file_path, sheet_name=sheet_name)
    records = frame.to_dict(orient='records')

    if verbose:
        logging.info(
            f"Read EXCEL data from {file_path} with {len(records)} rows")

    return records
16 changes: 16 additions & 0 deletions file_system/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from datetime import datetime
from os import makedirs, path


def get_date_string():
    """Return ``str(datetime.now())`` with every ':' and ' ' replaced by '_'."""
    to_underscore = str.maketrans({':': '_', ' ': '_'})
    return str(datetime.now()).translate(to_underscore)


def get_folder_path(full_path):
    """Return the directory part of ``full_path`` ('' when there is none)."""
    parent, _basename = path.split(full_path)
    return parent


def get_or_create_folder_path(full_path):
    """Make sure the parent folder of ``full_path`` exists and return it.

    Args:
        full_path: Path of a file whose containing folder is needed.

    Returns:
        The parent folder path as computed by ``get_folder_path``; this is ''
        for a bare file name, in which case nothing is created.
    """
    folder_path = get_folder_path(full_path)
    # A bare file name has no parent component; makedirs('') would raise.
    if folder_path and not path.exists(folder_path):
        # exist_ok guards against another process creating the folder between
        # the exists() check and this call.
        makedirs(folder_path, exist_ok=True)
    return folder_path
Loading