This article is for educational purposes only.
IMDb provides its own API for retrieving movie details, and you should use that for production work; this article is a web-scraping example.
We will get the movie's rating, number of ratings, name, and more. For this we use the BeautifulSoup and Requests packages.
import requests
from bs4 import BeautifulSoup
boys_url = "https://www.imdb.com/title/tt1190634/"
# timeout so a stalled connection cannot hang the script forever
# (requests has NO default timeout)
r = requests.get(url=boys_url, timeout=30)
# create a BeautifulSoup object from the fetched HTML
soup = BeautifulSoup(r.text, 'html.parser')
To get the title of the HTML page:
# page title
title = soup.find('title')
print(title.string)
data = {}  # bug fix: `data` was never defined in this snippet (NameError)
data["title"] = title.string
soup.find will look for the title tag; to get its text, use title.string, which returns the string.
Suppose you want to get all the div tags — use find_all:
soup.find_all('div')
To get a tag with a specific class value:
soup.find("div",{'class':'titleBar'})
Full code to get the details into a Python dict: this function fetches the HTML page with a GET request using requests, parses it into a BeautifulSoup object, and then extracts whatever data you want.
def getMovieDetails(url):
    """Fetch an IMDb title page and scrape its details into a dict.

    Parameters:
        url: IMDb title-page URL, e.g. "https://www.imdb.com/title/tt1190634/".

    Returns:
        dict with keys "title", "ratingValue", "ratingCount", "name",
        "subtext", "summary_text" and "credits" (a mapping from a role
        heading such as "Stars:" to a list of {"link", "name"} dicts).

    NOTE(review): relies on the classic IMDb page markup (itemprop spans,
    .titleBar / .subtext / .summary_text divs); a page redesign breaks it.
    """
    data = {}
    # timeout so a stalled connection cannot hang the scraper forever
    r = requests.get(url=url, timeout=30)
    # Create a BeautifulSoup object
    soup = BeautifulSoup(r.text, 'html.parser')

    # page title (the <title> element of the HTML document)
    title = soup.find('title')
    data["title"] = title.string

    # rating and the number of votes it is based on
    ratingValue = soup.find("span", {"itemprop": "ratingValue"})
    data["ratingValue"] = ratingValue.string
    ratingCount = soup.find("span", {"itemprop": "ratingCount"})
    data["ratingCount"] = ratingCount.string

    # name; \xa0 is the non-breaking space IMDb appends after the title
    titleName = soup.find("div", {'class': 'titleBar'}).find("h1")
    data["name"] = titleName.contents[0].replace(u'\xa0', u'')

    # additional details line (certificate | runtime | genres | year)
    subtext = soup.find("div", {'class': 'subtext'})
    data["subtext"] = ""
    for child in subtext.contents:
        # .string is None for tags with nested children; skip those
        # instead of crashing with AttributeError (bug in the original)
        if child.string is not None:
            data["subtext"] += child.string.strip()

    # plot summary
    summary_text = soup.find("div", {'class': 'summary_text'})
    data["summary_text"] = summary_text.string.strip()

    # credit blocks: one div per role heading (Creator:, Stars:, ...)
    data["credits"] = {}
    for block in soup.find_all("div", {'class': 'credit_summary_item'}):
        role = block.find("h4").string
        data["credits"][role] = []
        # distinct inner-loop variable: the original reused `i` for
        # both the outer and inner loops, which is error-prone
        for person in block.find_all("a"):
            data["credits"][role].append({
                "link": person["href"],
                "name": person.string
            })
    return data
Movies:
tenet_url = "https://www.imdb.com/title/tt6723592/"
joker_url = "https://www.imdb.com/title/tt7286456/"
Series:
boys_url = "https://www.imdb.com/title/tt1190634/"
To get the movie details, call the function:
getMovieDetails(boys_url)
{'title': 'The Boys (TV Series 2019– ) - IMDb',
'ratingValue': '8.7',
'ratingCount': '173,133',
'name': 'The Boys ',
'subtext': '18+|1h|Action,Comedy,Crime|TV Series (2019– )',
'summary_text': 'A group of vigilantes sets out to take down corrupt superheroes who abuse their superpowers.',
'credits': {'Creator:': [{'link': '/name/nm0471392/', 'name': 'Eric Kripke'}],
'Stars:': [{'link': '/name/nm0881631/', 'name': 'Karl Urban'},
{'link': '/name/nm4425051/', 'name': 'Jack Quaid'},
{'link': '/name/nm1102278/', 'name': 'Antony Starr'},
{'link': 'fullcredits/', 'name': 'See full cast & crew'}]}}
Additional content:
To get all the cast and crew members with their roles:
import re
def getCrewData(url):
    """Fetch an IMDb fullcredits page and scrape its cast list.

    Parameters:
        url: IMDb fullcredits URL, e.g.
             "https://www.imdb.com/title/tt6723592/fullcredits/".

    Returns:
        dict with "title" (page title string) and "crew" (a list of
        {"name", "character"} dicts, one per cast-table row).

    NOTE(review): depends on the fullcredits page's <table class="cast_list">
    markup; a redesign breaks the scrape.
    """
    # compile once and reuse: the original rebuilt this pattern twice
    # per table row; keeps only letters, apostrophes and spaces
    clean = re.compile(r"[^a-zA-Z' ]+")
    crew_data = {
        "crew": []
    }
    # timeout so a stalled connection cannot hang the scraper forever
    r = requests.get(url=url, timeout=30)
    # Create a BeautifulSoup object
    soup = BeautifulSoup(r.text, 'html.parser')
    # page title
    title = soup.find('title')
    crew_data["title"] = title.string
    cast_list = soup.find("table", {"class": "cast_list"})
    for tr in cast_list.find_all('tr'):
        td = tr.find_all('td')
        # real cast rows have exactly 4 cells: photo, actor, "...", character
        if len(td) == 4:
            row = [cell.text for cell in td]
            crew_data["crew"].append({
                "name": clean.sub('', row[1]).strip(),
                "character": clean.sub('', row[3]).strip()
            })
    return crew_data
Series:
boys_url = "https://www.imdb.com/title/tt1190634/fullcredits/"
Movie
tenet_url = "https://www.imdb.com/title/tt6723592/fullcredits/"
getCrewData(tenet_url)
Conclusion: Web scraping is never fully stable — when changes are made to the web pages, they break the previously built code logic. Not all websites allow you to scrape their content, and some use JavaScript to render the page (built with Vue, React, etc.); in that case use Selenium to get the rendered page. Use this for educational purposes only.
Top comments (0)