mirror of
https://hub.njuu.cf/TheAlgorithms/Python.git
synced 2023-10-11 13:06:12 +08:00
Create instagram_crawler.py (#2509)
* Create instagram_crawler.py * codespell --ignore-words-list=followings * Update web_programming/instagram_crawler.py Co-authored-by: Christian Clauss <cclauss@me.com> * Update web_programming/instagram_crawler.py Co-authored-by: Christian Clauss <cclauss@me.com> * Update web_programming/instagram_crawler.py Co-authored-by: Christian Clauss <cclauss@me.com> * Update web_programming/instagram_crawler.py Co-authored-by: Christian Clauss <cclauss@me.com> * Update web_programming/instagram_crawler.py Co-authored-by: Christian Clauss <cclauss@me.com> * Update web_programming/instagram_crawler.py Co-authored-by: Christian Clauss <cclauss@me.com> * Update web_programming/instagram_crawler.py Co-authored-by: Christian Clauss <cclauss@me.com> * Update instagram_crawler.py * Add doctests * fixup! except (json.decoder.JSONDecodeError, KeyError): * if getenv("CONTINUOUS_INTEGRATION"): return * Update instagram_crawler.py * Update web_programming/instagram_crawler.py Co-authored-by: Dhruv <dhruvmanila@gmail.com> * added fake_useragent * Update instagram_crawler.py * Comment out doctests Co-authored-by: Christian Clauss <cclauss@me.com> Co-authored-by: Dhruv <dhruvmanila@gmail.com>
This commit is contained in:
parent
e166350509
commit
e6e2dc69d5
2
.github/workflows/codespell.yml
vendored
2
.github/workflows/codespell.yml
vendored
@ -11,7 +11,7 @@ jobs:
|
||||
- run: pip install codespell flake8
|
||||
- run: |
|
||||
SKIP="./.*,./other/dictionary.txt,./other/words,./project_euler/problem_22/p022_names.txt"
|
||||
codespell --ignore-words-list=ans,fo,hist,iff,secant,som,tim --skip=$SKIP --quiet-level=2
|
||||
codespell --ignore-words-list=ans,fo,followings,hist,iff,secant,som,tim --skip=$SKIP --quiet-level=2
|
||||
- name: Codespell comment
|
||||
if: ${{ failure() }}
|
||||
uses: plettich/python_codespell_action@master
|
||||
|
140
web_programming/instagram_crawler.py
Normal file
140
web_programming/instagram_crawler.py
Normal file
@ -0,0 +1,140 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from fake_useragent import UserAgent
|
||||
|
||||
headers = {"UserAgent": UserAgent().random}
|
||||
|
||||
|
||||
def extract_user_profile(script) -> dict:
|
||||
"""
|
||||
May raise json.decoder.JSONDecodeError
|
||||
"""
|
||||
data = script.contents[0]
|
||||
info = json.loads(data[data.find('{"config"'): -1])
|
||||
return info["entry_data"]["ProfilePage"][0]["graphql"]["user"]
|
||||
|
||||
|
||||
class InstagramUser:
|
||||
"""
|
||||
Class Instagram crawl instagram user information
|
||||
|
||||
Usage: (doctest failing on Travis CI)
|
||||
# >>> instagram_user = InstagramUser("github")
|
||||
# >>> instagram_user.is_verified
|
||||
True
|
||||
# >>> instagram_user.biography
|
||||
'Built for developers.'
|
||||
"""
|
||||
|
||||
def __init__(self, username):
|
||||
self.url = f"https://www.instagram.com/{username}/"
|
||||
self.user_data = self.get_json()
|
||||
|
||||
def get_json(self) -> dict:
|
||||
"""
|
||||
Return a dict of user information
|
||||
"""
|
||||
html = requests.get(self.url, headers=headers).text
|
||||
scripts = BeautifulSoup(html, "html.parser").find_all("script")
|
||||
try:
|
||||
return extract_user_profile(scripts[4])
|
||||
except (json.decoder.JSONDecodeError, KeyError):
|
||||
return extract_user_profile(scripts[3])
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"{self.__class__.__name__}('{self.username}')"
|
||||
|
||||
def __str__(self) -> str:
|
||||
return f"{self.fullname} ({self.username}) is {self.biography}"
|
||||
|
||||
@property
|
||||
def username(self) -> str:
|
||||
return self.user_data["username"]
|
||||
|
||||
@property
|
||||
def fullname(self) -> str:
|
||||
return self.user_data["full_name"]
|
||||
|
||||
@property
|
||||
def biography(self) -> str:
|
||||
return self.user_data["biography"]
|
||||
|
||||
@property
|
||||
def email(self) -> str:
|
||||
return self.user_data["business_email"]
|
||||
|
||||
@property
|
||||
def website(self) -> str:
|
||||
return self.user_data["external_url"]
|
||||
|
||||
@property
|
||||
def number_of_followers(self) -> int:
|
||||
return self.user_data["edge_followed_by"]["count"]
|
||||
|
||||
@property
|
||||
def number_of_followings(self) -> int:
|
||||
return self.user_data["edge_follow"]["count"]
|
||||
|
||||
@property
|
||||
def number_of_posts(self) -> int:
|
||||
return self.user_data["edge_owner_to_timeline_media"]["count"]
|
||||
|
||||
@property
|
||||
def profile_picture_url(self) -> str:
|
||||
return self.user_data["profile_pic_url_hd"]
|
||||
|
||||
@property
|
||||
def is_verified(self) -> bool:
|
||||
return self.user_data["is_verified"]
|
||||
|
||||
@property
|
||||
def is_private(self) -> bool:
|
||||
return self.user_data["is_private"]
|
||||
|
||||
|
||||
def test_instagram_user(username: str = "github") -> None:
|
||||
"""
|
||||
A self running doctest
|
||||
>>> test_instagram_user()
|
||||
"""
|
||||
from os import getenv
|
||||
|
||||
if getenv("CONTINUOUS_INTEGRATION"):
|
||||
return # test failing on Travis CI
|
||||
instagram_user = InstagramUser(username)
|
||||
assert instagram_user.user_data
|
||||
assert isinstance(instagram_user.user_data, dict)
|
||||
assert instagram_user.username == username
|
||||
if username != "github":
|
||||
return
|
||||
assert instagram_user.fullname == "GitHub"
|
||||
assert instagram_user.biography == "Built for developers."
|
||||
assert instagram_user.number_of_posts > 150
|
||||
assert instagram_user.number_of_followers > 120000
|
||||
assert instagram_user.number_of_followings > 15
|
||||
assert instagram_user.email == "support@github.com"
|
||||
assert instagram_user.website == "https://github.com/readme"
|
||||
assert instagram_user.profile_picture_url.startswith("https://instagram.")
|
||||
assert instagram_user.is_verified is True
|
||||
assert instagram_user.is_private is False
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
|
||||
doctest.testmod()
|
||||
instagram_user = InstagramUser("github")
|
||||
print(instagram_user)
|
||||
print(f"{instagram_user.number_of_posts = }")
|
||||
print(f"{instagram_user.number_of_followers = }")
|
||||
print(f"{instagram_user.number_of_followings = }")
|
||||
print(f"{instagram_user.email = }")
|
||||
print(f"{instagram_user.website = }")
|
||||
print(f"{instagram_user.profile_picture_url = }")
|
||||
print(f"{instagram_user.is_verified = }")
|
||||
print(f"{instagram_user.is_private = }")
|
Loading…
Reference in New Issue
Block a user