forked from TheAlgorithms/Python
-
Notifications
You must be signed in to change notification settings - Fork 0
/
instagram_crawler.py
140 lines (111 loc) · 4.05 KB
/
instagram_crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/env python3
from __future__ import annotations
import json
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
headers = {"UserAgent": UserAgent().random}
def extract_user_profile(script) -> dict:
"""
May raise json.decoder.JSONDecodeError
"""
data = script.contents[0]
info = json.loads(data[data.find('{"config"') : -1])
return info["entry_data"]["ProfilePage"][0]["graphql"]["user"]
class InstagramUser:
"""
Class Instagram crawl instagram user information
Usage: (doctest failing on GitHub Actions)
# >>> instagram_user = InstagramUser("github")
# >>> instagram_user.is_verified
True
# >>> instagram_user.biography
'Built for developers.'
"""
def __init__(self, username):
self.url = f"https://www.instagram.com/{username}/"
self.user_data = self.get_json()
def get_json(self) -> dict:
"""
Return a dict of user information
"""
html = requests.get(self.url, headers=headers, timeout=10).text
scripts = BeautifulSoup(html, "html.parser").find_all("script")
try:
return extract_user_profile(scripts[4])
except (json.decoder.JSONDecodeError, KeyError):
return extract_user_profile(scripts[3])
def __repr__(self) -> str:
return f"{self.__class__.__name__}('{self.username}')"
def __str__(self) -> str:
return f"{self.fullname} ({self.username}) is {self.biography}"
@property
def username(self) -> str:
return self.user_data["username"]
@property
def fullname(self) -> str:
return self.user_data["full_name"]
@property
def biography(self) -> str:
return self.user_data["biography"]
@property
def email(self) -> str:
return self.user_data["business_email"]
@property
def website(self) -> str:
return self.user_data["external_url"]
@property
def number_of_followers(self) -> int:
return self.user_data["edge_followed_by"]["count"]
@property
def number_of_followings(self) -> int:
return self.user_data["edge_follow"]["count"]
@property
def number_of_posts(self) -> int:
return self.user_data["edge_owner_to_timeline_media"]["count"]
@property
def profile_picture_url(self) -> str:
return self.user_data["profile_pic_url_hd"]
@property
def is_verified(self) -> bool:
return self.user_data["is_verified"]
@property
def is_private(self) -> bool:
return self.user_data["is_private"]
def test_instagram_user(username: str = "github") -> None:
"""
A self running doctest
>>> test_instagram_user()
"""
import os
if os.environ.get("CI"):
return # test failing on GitHub Actions
instagram_user = InstagramUser(username)
assert instagram_user.user_data
assert isinstance(instagram_user.user_data, dict)
assert instagram_user.username == username
if username != "github":
return
assert instagram_user.fullname == "GitHub"
assert instagram_user.biography == "Built for developers."
assert instagram_user.number_of_posts > 150
assert instagram_user.number_of_followers > 120000
assert instagram_user.number_of_followings > 15
assert instagram_user.email == "[email protected]"
assert instagram_user.website == "https://github.com/readme"
assert instagram_user.profile_picture_url.startswith("https://instagram.")
assert instagram_user.is_verified is True
assert instagram_user.is_private is False
if __name__ == "__main__":
import doctest
doctest.testmod()
instagram_user = InstagramUser("github")
print(instagram_user)
print(f"{instagram_user.number_of_posts = }")
print(f"{instagram_user.number_of_followers = }")
print(f"{instagram_user.number_of_followings = }")
print(f"{instagram_user.email = }")
print(f"{instagram_user.website = }")
print(f"{instagram_user.profile_picture_url = }")
print(f"{instagram_user.is_verified = }")
print(f"{instagram_user.is_private = }")