This repository has been archived by the owner on Jan 11, 2022. It is now read-only.

Added Python Package #9

Open · wants to merge 1 commit into base: master
107 changes: 107 additions & 0 deletions .gitignore
@@ -0,0 +1,107 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

.vscode
.DS_Store
5 changes: 5 additions & 0 deletions instagram_scraper/__init__.py
@@ -0,0 +1,5 @@
from .instagram_scraper import main
Owner:
Why do we need main here?


__name__ = "Instagram Scraper"
__author__ = "Meet Mangukiya <[email protected]>"
__version__ = '1.0.0'
Owner:
Not a stable API yet, 0.1.0 please.

Owner:
Import scrape_instagram_tag and scrape_instagram here; from instagram_scraper.instagram_scraper import ... is too long.
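
Taken together, the three comments above point at an __init__.py like the following. This is only a sketch of the requested change, not part of the diff: it assumes scrape_instagram and scrape_instagram_tag keep the names shown in instagram_scraper.py below, and it stops reassigning the module's __name__ (which setup.py currently uses as the distribution name, so setup.py would then need a literal name string).

# Sketch of the __init__.py the review is converging on.
# Re-export the scraping helpers so callers can write
# "from instagram_scraper import scrape_instagram_tag" directly.
from .instagram_scraper import scrape_instagram, scrape_instagram_tag

__author__ = "Meet Mangukiya <[email protected]>"
__version__ = '0.1.0'  # pre-1.0 while the API is still unstable, per the review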

14 changes: 10 additions & 4 deletions instagram_scraper.py → instagram_scraper/instagram_scraper.py
@@ -10,7 +10,6 @@
from requests_html import HTMLSession



# Source: http://blog.jstassen.com/2016/03/code-regex-for-instagram-username-and-hashtags/
REGEXES = {
'hashtag': re.compile('(?:#)([A-Za-z0-9_](?:(?:[A-Za-z0-9_]|(?:\.(?!\.))){0,28}(?:[A-Za-z0-9_]))?)'),
@@ -61,13 +60,14 @@ def scrape_instagram(tags: List[str], total_count: int=50, existing: set=None):
for tag in tags:
yield from scrape_instagram_tag(tag, total_count)


def main(tags, total_count, should_continue):
def _single_tag_processing(tag, total_count, existing_links, start):
os.makedirs(f'data/{tag}', exist_ok=True)
with open(f'data/{tag}/data.csv', 'a' if existing_links else 'w', newline='', encoding='utf-8') as csvfile:
writer = csv.writer(csvfile, delimiter=',')
for count, (url, caption, hashtags, mentions) in enumerate(scrape_instagram_tag(
tag, total_count, existing_links), start):
tag, total_count, existing_links), start):

try:
req = requests.get(url)
@@ -83,7 +83,8 @@ def _single_tag_processing(tag, total_count, existing_links, start):
', '.join(hashtags),
', '.join(mentions)
])
print(f'[{tag}] downloaded {url} as {count}.jpg in data/{tag}')
print(
f'[{tag}] downloaded {url} as {count}.jpg in data/{tag}')

for tag in tags:
existing_links = set()
@@ -96,7 +97,8 @@ def _single_tag_processing(tag, total_count, existing_links, start):
start = i + 1
_single_tag_processing(tag, total_count, existing_links, start)

if __name__ == '__main__':

def run():
parser = argparse.ArgumentParser()
parser.add_argument('--tags', '-t', nargs='+',
help='Tags to scrape images from')
@@ -111,3 +113,7 @@ def _single_tag_processing(tag, total_count, existing_links, start):
assert args.tags, "Enter tags to scrape! Use --tags option, see help."
assert args.count, "Enter total number of images to scrape using --count option, see help."
main(args.tags, args.count, args.cont)


if __name__ == '__main__':
run()
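
Since run() now owns all argument parsing, the scraper can also be driven programmatically through main(), whose signature is visible in the hunk above. A minimal sketch, assuming the package is installed and Instagram is reachable; the nature tag is an arbitrary example, and should_continue appears to control resuming an existing data/<tag>/data.csv (that hunk is collapsed here):

from instagram_scraper.instagram_scraper import main

# Download up to 20 posts tagged #nature into data/nature/,
# recording URL, caption, hashtags and mentions in data.csv.
main(tags=['nature'], total_count=20, should_continue=False)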
28 changes: 28 additions & 0 deletions setup.py
@@ -0,0 +1,28 @@
import setuptools
import instagram_scraper

with open("README.md", "r") as fh:
    long_description = fh.read()

setuptools.setup(
    name=instagram_scraper.__name__,
    version=instagram_scraper.__version__,
    author=instagram_scraper.__author__,
    author_email="[email protected]",
    description="Scrape the Instagram frontend",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/meetmangukiya/instagram-scraper",
    packages=setuptools.find_packages(),
    install_requires=['requests-html'],
    classifiers=(
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ),
    entry_points={
        'console_scripts': [
            'scrape-insta = instagram_scraper.instagram_scraper:run',
Owner:
prefer scrape-instagram

        ],
    }
)
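
For completeness, this is roughly how the new console script would be exercised once the package is installed. The --tags and --count spellings come from the argparse setup and asserts above; the command would be scrape-instagram instead if the reviewer's naming preference is adopted:

pip install .
scrape-insta --tags nature travel --count 50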