Create instagram_crawler.py (#2509)

* Create instagram_crawler.py

* codespell --ignore-words-list=followings

* Update web_programming/instagram_crawler.py

Co-authored-by: Christian Clauss <cclauss@me.com>

* Update web_programming/instagram_crawler.py

Co-authored-by: Christian Clauss <cclauss@me.com>

* Update web_programming/instagram_crawler.py

Co-authored-by: Christian Clauss <cclauss@me.com>

* Update web_programming/instagram_crawler.py

Co-authored-by: Christian Clauss <cclauss@me.com>

* Update web_programming/instagram_crawler.py

Co-authored-by: Christian Clauss <cclauss@me.com>

* Update web_programming/instagram_crawler.py

Co-authored-by: Christian Clauss <cclauss@me.com>

* Update web_programming/instagram_crawler.py

Co-authored-by: Christian Clauss <cclauss@me.com>

* Update instagram_crawler.py

* Add doctests

* fixup! except (json.decoder.JSONDecodeError, KeyError):

* if getenv("CONTINUOUS_INTEGRATION"): return

* Update instagram_crawler.py

* Update web_programming/instagram_crawler.py

Co-authored-by: Dhruv <dhruvmanila@gmail.com>

* added fake_useragent

* Update instagram_crawler.py

* Comment out doctests

Co-authored-by: Christian Clauss <cclauss@me.com>
Co-authored-by: Dhruv <dhruvmanila@gmail.com>
This commit is contained in:
YOGESHWARAN R 2020-09-29 21:34:55 +05:30 committed by GitHub
parent e166350509
commit e6e2dc69d5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 141 additions and 1 deletions

View File

@ -11,7 +11,7 @@ jobs:
- run: pip install codespell flake8
- run: |
SKIP="./.*,./other/dictionary.txt,./other/words,./project_euler/problem_22/p022_names.txt"
codespell --ignore-words-list=ans,fo,hist,iff,secant,som,tim --skip=$SKIP --quiet-level=2
codespell --ignore-words-list=ans,fo,followings,hist,iff,secant,som,tim --skip=$SKIP --quiet-level=2
- name: Codespell comment
if: ${{ failure() }}
uses: plettich/python_codespell_action@master

View File

@ -0,0 +1,140 @@
#!/usr/bin/env python3
from __future__ import annotations
import json
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
headers = {"UserAgent": UserAgent().random}
def extract_user_profile(script) -> dict:
"""
May raise json.decoder.JSONDecodeError
"""
data = script.contents[0]
info = json.loads(data[data.find('{"config"'): -1])
return info["entry_data"]["ProfilePage"][0]["graphql"]["user"]
class InstagramUser:
"""
Class Instagram crawl instagram user information
Usage: (doctest failing on Travis CI)
# >>> instagram_user = InstagramUser("github")
# >>> instagram_user.is_verified
True
# >>> instagram_user.biography
'Built for developers.'
"""
def __init__(self, username):
self.url = f"https://www.instagram.com/{username}/"
self.user_data = self.get_json()
def get_json(self) -> dict:
"""
Return a dict of user information
"""
html = requests.get(self.url, headers=headers).text
scripts = BeautifulSoup(html, "html.parser").find_all("script")
try:
return extract_user_profile(scripts[4])
except (json.decoder.JSONDecodeError, KeyError):
return extract_user_profile(scripts[3])
def __repr__(self) -> str:
return f"{self.__class__.__name__}('{self.username}')"
def __str__(self) -> str:
return f"{self.fullname} ({self.username}) is {self.biography}"
@property
def username(self) -> str:
return self.user_data["username"]
@property
def fullname(self) -> str:
return self.user_data["full_name"]
@property
def biography(self) -> str:
return self.user_data["biography"]
@property
def email(self) -> str:
return self.user_data["business_email"]
@property
def website(self) -> str:
return self.user_data["external_url"]
@property
def number_of_followers(self) -> int:
return self.user_data["edge_followed_by"]["count"]
@property
def number_of_followings(self) -> int:
return self.user_data["edge_follow"]["count"]
@property
def number_of_posts(self) -> int:
return self.user_data["edge_owner_to_timeline_media"]["count"]
@property
def profile_picture_url(self) -> str:
return self.user_data["profile_pic_url_hd"]
@property
def is_verified(self) -> bool:
return self.user_data["is_verified"]
@property
def is_private(self) -> bool:
return self.user_data["is_private"]
def test_instagram_user(username: str = "github") -> None:
"""
A self running doctest
>>> test_instagram_user()
"""
from os import getenv
if getenv("CONTINUOUS_INTEGRATION"):
return # test failing on Travis CI
instagram_user = InstagramUser(username)
assert instagram_user.user_data
assert isinstance(instagram_user.user_data, dict)
assert instagram_user.username == username
if username != "github":
return
assert instagram_user.fullname == "GitHub"
assert instagram_user.biography == "Built for developers."
assert instagram_user.number_of_posts > 150
assert instagram_user.number_of_followers > 120000
assert instagram_user.number_of_followings > 15
assert instagram_user.email == "support@github.com"
assert instagram_user.website == "https://github.com/readme"
assert instagram_user.profile_picture_url.startswith("https://instagram.")
assert instagram_user.is_verified is True
assert instagram_user.is_private is False
if __name__ == "__main__":
import doctest
doctest.testmod()
instagram_user = InstagramUser("github")
print(instagram_user)
print(f"{instagram_user.number_of_posts = }")
print(f"{instagram_user.number_of_followers = }")
print(f"{instagram_user.number_of_followings = }")
print(f"{instagram_user.email = }")
print(f"{instagram_user.website = }")
print(f"{instagram_user.profile_picture_url = }")
print(f"{instagram_user.is_verified = }")
print(f"{instagram_user.is_private = }")