What will be scraped
Full Code
import os, requests, lxml, re, json, urllib.request
from bs4 import BeautifulSoup
from serpapi import GoogleSearch
# Request headers: a browser-like User-Agent so the request looks like a real user visit.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36"
}

# URL query parameters for the Google Images search.
params = {
    "q": "minecraft wallpaper 4k",  # search query
    "tbm": "isch",                  # image results
    "hl": "en",                     # language of the search
    "gl": "us",                     # country where search comes from
    "ijn": "0"                      # page number
}

# Fetch the results page once at module level; the functions below read `soup`.
html = requests.get("https://www.google.com/search", params=params, headers=headers, timeout=30)
soup = BeautifulSoup(html.text, "lxml")
def get_images_with_request_headers():
    """Return the ``src`` value of every ``<img>`` tag in the parsed page.

    Also updates the module-level ``params`` in place: ``"ijn"`` is dropped
    and ``"content-type"`` is set to ``"image/png"``.
    """
    # pop() with a default instead of `del`, so a second call does not raise
    # KeyError once "ijn" has already been removed.
    params.pop("ijn", None)
    params["content-type"] = "image/png"  # parameter that indicate the original media type

    # NOTE(review): `soup` was built from a request made before these params
    # were changed, so the change can only affect requests issued afterwards.
    # Confirm whether a fresh request was intended here.

    # Skip <img> tags that have no src attribute; img["src"] raises KeyError on them.
    return [img["src"] for img in soup.select("img") if img.has_attr("src")]
def get_suggested_search_data():
    """Return the suggested searches shown above the image results.

    Reads the module-level ``soup``. Each returned item is a dict with the
    keys ``name``, ``link``, ``chips`` and ``thumbnail``.
    """
    suggested_searches = []

    all_script_tags = soup.select("script")

    # https://regex101.com/r/48UZhY/6
    # The matched payload is only searched with regular expressions, so it is
    # kept as plain text. (A json.dumps() -> json.loads() round trip on a str
    # returns the same str, so it is not needed.)
    matched_images = "".join(re.findall(r"AF_initDataCallback\(({key: 'ds:1'.*?)\);</script>", str(all_script_tags)))

    # search for only suggested search thumbnails related
    # https://regex101.com/r/ITluak/2
    suggested_search_thumbnails = ",".join(re.findall(r'{key(.*?)\[null,\"Size\"', matched_images))

    # https://regex101.com/r/MyNLUk/1
    suggested_search_thumbnail_encoded = re.findall(r'\"(https:\/\/encrypted.*?)\"', suggested_search_thumbnails)

    # zip() stops at the shorter of the two sequences.
    for suggested_search, suggested_search_fixed_thumbnail in zip(soup.select(".PKhmud.sc-it.tzVsfd"), suggested_search_thumbnail_encoded):
        suggested_searches.append({
            "name": suggested_search.select_one(".VlHyHc").text,
            "link": f"https://www.google.com{suggested_search.a['href']}",
            # https://regex101.com/r/y51ZoC/1
            "chips": "".join(re.findall(r"&chips=(.*?)&", suggested_search.a["href"])),
            # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
            "thumbnail": bytes(suggested_search_fixed_thumbnail, "ascii").decode("unicode-escape")
        })

    return suggested_searches
def get_original_images():
    """Extract image results from the parsed page and download the originals.

    Reads the module-level ``soup``. Returns a list of dicts with the keys
    ``title``, ``link``, ``source``, ``thumbnail`` and ``original``.

    Side effects: prints a progress line per image, installs a global urllib
    opener carrying a User-Agent header, and saves every original image to
    ``Bs4_Images/original_size_img_<index>.jpg``.
    """
    google_images = []

    all_script_tags = soup.select("script")

    # https://regex101.com/r/48UZhY/4
    # The payload is only searched with regular expressions, so it stays plain
    # text. (A json.dumps() -> json.loads() round trip on a str is a no-op.)
    matched_images_data = "".join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))

    # https://regex101.com/r/pdZOnW/3
    matched_google_image_data = re.findall(r'\[\"GRID_STATE0\",null,\[\[1,\[0,\".*?\",(.*),\"All\",', matched_images_data)

    # https://regex101.com/r/NnRg27/1
    # NOTE(review): the findall pattern was missing from this snippet; the
    # thumbnail pattern that re.sub() uses below is assumed. Confirm against
    # the regex101 link above.
    thumbnail_pattern = r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]'
    matched_google_images_thumbnails = ", ".join(
        re.findall(thumbnail_pattern, str(matched_google_image_data))).split(", ")

    # Decoded twice: after the first pass escaped Unicode characters are still present.
    thumbnails = [
        bytes(bytes(thumbnail, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape") for thumbnail in matched_google_images_thumbnails
    ]

    # removing previously matched thumbnails for easier full resolution image matches.
    removed_matched_google_images_thumbnails = re.sub(thumbnail_pattern, "", str(matched_google_image_data))

    # https://regex101.com/r/fXjfb1/4
    # https://stackoverflow.com/a/19821774/15164646
    matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]", removed_matched_google_images_thumbnails)

    full_res_images = [
        bytes(bytes(img, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape") for img in matched_google_full_resolution_images
    ]

    # urlretrieve() cannot write into a directory that does not exist yet.
    os.makedirs("Bs4_Images", exist_ok=True)

    # The opener is installed globally, so building it once before the loop is enough.
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36')]
    urllib.request.install_opener(opener)

    for index, (metadata, thumbnail, original) in enumerate(zip(soup.select('.isv-r.PNCib.MSM1fd.BUooTd'), thumbnails, full_res_images), start=1):
        # Title and link live on the same element; look it up once.
        anchor = metadata.select_one(".VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb")

        google_images.append({
            "title": anchor["title"],
            "link": anchor["href"],
            "source": metadata.select_one(".fxgdke").text,
            "thumbnail": thumbnail,
            "original": original
        })

        # Download original images
        print(f'Downloading {index} image...')
        urllib.request.urlretrieve(original, f'Bs4_Images/original_size_img_{index}.jpg')

    return google_images
Install libraries:
pip install requests bs4 google-search-results
google-search-results is a SerpApi API package that will be shown at the end as an alternative solution.
Basic knowledge scraping with CSS selectors
CSS selectors declare which part of the markup a style applies to thus allowing to extract data from matching tags and attributes.
If you haven't scraped with CSS selectors, there's a dedicated blog post of mine
about how to use CSS selectors when web-scraping that covers what it is, its pros and cons, and why they matter from a web-scraping perspective.
Reduce the chance of being blocked
There's a chance that a request might be blocked. Have a look
at how to reduce the chance of being blocked while web-scraping, there are eleven methods to bypass blocks from most websites.
Make sure to pass a User-Agent, because Google might block your requests eventually and you'll receive different HTML and thus empty output.
The User-Agent identifies the browser, its version number, and its host operating system; it represents a person (browser) in a Web context and lets servers and network peers identify whether it's a bot or not. Here we're faking a "real" user visit. Check what is your user-agent.
Code Explanation
Import libraries:
import os, requests, lxml, re, json, urllib.request
from bs4 import BeautifulSoup
from serpapi import GoogleSearch
Library | Purpose |
os |
to return environment variable (SerpApi API key) value. |
requests |
to make a request to the website. |
lxml |
to process XML/HTML documents fast. |
json |
to convert extracted data to a JSON object. |
re |
to extract parts of the data via regular expression. |
urllib.request |
to save images locally with urllib.request.urlretrieve
BeautifulSoup |
is an XML/HTML scraping library. It's used in combination with lxml as it is faster than html.parser
Create URL parameter and request headers:
# Request headers: a browser-like User-Agent so the request looks like a real user visit.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36"
}

# URL query parameters for the Google Images search.
params = {
    "q": "minecraft wallpaper 4k",  # search query
    "tbm": "isch",                  # image results
    "hl": "en",                     # language of the search
    "gl": "us",                     # country where search comes from
    "ijn": "0"                      # page number
}
Code | Explanation |
params |
a prettier way of passing URL parameters to a request. |
user-agent |
to act as a "real" user request from the browser by passing it to request headers. The default requests user-agent is python-requests, so websites might understand that it's a bot or a script and block the request to the website. Check what's your user-agent. |
Make a request, pass created request parameters and headers. Pass returned HTML to BeautifulSoup
# Request the Google search page, passing the query parameters and request headers;
# stop waiting for a response after 30 seconds.
html = requests.get("https://www.google.com/search", params=params, headers=headers, timeout=30)
# Parse the returned HTML text with the lxml parser.
soup = BeautifulSoup(html.text, "lxml")
Code | Explanation |
timeout=30 |
to stop waiting for response after 30 seconds. |
BeautifulSoup(html.text, "lxml") |
html.text returns the HTML as text, and "lxml" is set as the XML/HTML parser instead of the default html.parser
Extracting data with request headers only, no regular expressions at the moment:
def get_images_with_request_headers():
    """Return the ``src`` value of every ``<img>`` tag in the parsed page.

    Also sets ``"content-type"`` to ``"image/png"`` on the module-level
    ``params`` dict.
    """
    params["content-type"] = "image/png"  # parameter that indicate the original media type

    # NOTE(review): `soup` was built from a request made before this parameter
    # was set, so the change can only affect requests issued afterwards.
    # Confirm whether a fresh request was intended here.

    # Skip <img> tags that have no src attribute; img["src"] raises KeyError on them.
    return [img["src"] for img in soup.select("img") if img.has_attr("src")]
The reason why it's handy is because when you try to directly parse data from the img
tag and src
attribute, you'll get a base64 encoded URL which will be a 1x1 image placeholder. Not a particularly useful image resolution 🙂
Code | Explanation |
params["content-type"] |
will create a new dict key "content-type" and assign it an "image/png" value, which will return images. |
[img["src"] for img in soup.select("img")] |
will iterate over all img tags and extract the src attribute in a list comprehension; the returned value is a list of URLs from the src attribute. |
Print returned data:
Now to the suggested search results, a thing above actual images:
def get_suggested_search_data():
    """Return the suggested searches shown above the image results.

    Reads the module-level ``soup``. Each returned item is a dict with the
    keys ``name``, ``link``, ``chips`` and ``thumbnail``.
    """
    suggested_searches = []

    all_script_tags = soup.select("script")

    # https://regex101.com/r/48UZhY/6
    # The matched payload is only searched with regular expressions, so it is
    # kept as plain text. (A json.dumps() -> json.loads() round trip on a str
    # returns the same str, so it is not needed.)
    matched_images = "".join(re.findall(r"AF_initDataCallback\(({key: 'ds:1'.*?)\);</script>", str(all_script_tags)))

    # search for only suggested search thumbnails related
    # https://regex101.com/r/ITluak/2
    suggested_search_thumbnails = ",".join(re.findall(r'{key(.*?)\[null,\"Size\"', matched_images))

    # https://regex101.com/r/MyNLUk/1
    suggested_search_thumbnail_encoded = re.findall(r'\"(https:\/\/encrypted.*?)\"', suggested_search_thumbnails)

    # zip() is used on purpose over zip_longest() as number of results would be identical
    for suggested_search, suggested_search_fixed_thumbnail in zip(soup.select(".PKhmud.sc-it.tzVsfd"), suggested_search_thumbnail_encoded):
        suggested_searches.append({
            "name": suggested_search.select_one(".VlHyHc").text,
            "link": f"https://www.google.com{suggested_search.a['href']}",
            # https://regex101.com/r/y51ZoC/1
            "chips": "".join(re.findall(r"&chips=(.*?)&", suggested_search.a["href"])),
            # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
            "thumbnail": bytes(suggested_search_fixed_thumbnail, "ascii").decode("unicode-escape")
        })

    return suggested_searches
Code | Explanation |
suggested_searches |
a temporary list where extracted data will be appended at the end of the function. |
all_script_tags |
a variable which will hold all extracted <script> HTML tags from soup.select("script") where select() will return a list of matched <script> tags. |
matched_images |
will hold all extracted matched images data: re.findall() returns a list of matches, which "".join() merges into a single string. This variable is needed to extract suggested search thumbnails, image thumbnails and full-resolution images. |
suggested_search_thumbnails and suggested_search_thumbnail_encoded
parses part of inline JSON where suggested_search_thumbnail_encoded parses actual thumbnails from partly parsed inline JSON data. |
zip() |
to iterate over multiple iterables in parallel. Keep in mind that zip is used on purpose. zip() ends with the shortest iterator while zip_longest() iterates up to the length of the longest iterator. |
suggested_searches.append({}) |
to append extracted images data to a list as a dictionary. |
select_one() |
to return one (instead of all) matched element in a loop. |
["href"] |
is a shortcut of accessing and extracting HTML attributes with BeautifulSoup . Alternative is get(<attribute>) . |
"".join() |
to join all items from an iterable into a string. |
bytes(<variable>, "ascii").decode("unicode-escape") |
to decode parsed image data. |
Printed returned data:
"name": "ultra hd",
"link": "https://www.google.com/search?q=minecraft+wallpaper+4k&tbm=isch&hl=en&gl=us&chips=q:minecraft+wallpaper+4k,g_1:ultra+hd:5VuluDYWa8Y%3D&sa=X&ved=2ahUKEwjshdCK0Yn5AhXrlWoFHYhyCrQQ4lYoAHoECAEQHQ",
"chips": "q:minecraft+wallpaper+4k,g_1:ultra+hd:5VuluDYWa8Y%3D",
"thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcThU0xo_GeIciyaBmvE6EI46tnj0npeDAmDsLKjYlnv4tGz0eaw&usqp=CAU"
"name": "epic",
"link": "https://www.google.com/search?q=minecraft+wallpaper+4k&tbm=isch&hl=en&gl=us&chips=q:minecraft+wallpaper+4k,g_1:epic:5c56RYLjq2c%3D&sa=X&ved=2ahUKEwjshdCK0Yn5AhXrlWoFHYhyCrQQ4lYoAXoECAEQHw",
"chips": "q:minecraft+wallpaper+4k,g_1:epic:5c56RYLjq2c%3D",
"thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQ_bUq-7tk9FyeNSW40Yo8FRY6SOViMbUeme_ln1uMwxcTdfI6d&usqp=CAU"
}, ... other results
Extracting original resolution images:
def get_original_images():
    """Extract image results from the parsed page and download the originals.

    Reads the module-level ``soup``. Returns a list of dicts with the keys
    ``title``, ``link``, ``source``, ``thumbnail`` and ``original``.

    Side effects: prints a progress line per image, installs a global urllib
    opener carrying a User-Agent header, and saves every original image to
    ``Bs4_Images/original_size_img_<index>.jpg``.
    """
    google_images = []

    all_script_tags = soup.select("script")

    # https://regex101.com/r/48UZhY/4
    # The payload is only searched with regular expressions, so it stays plain
    # text. (A json.dumps() -> json.loads() round trip on a str is a no-op.)
    matched_images_data = "".join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))

    # https://regex101.com/r/pdZOnW/3
    matched_google_image_data = re.findall(r'\[\"GRID_STATE0\",null,\[\[1,\[0,\".*?\",(.*),\"All\",', matched_images_data)

    # https://regex101.com/r/NnRg27/1
    # NOTE(review): the findall pattern was missing from this snippet; the
    # thumbnail pattern that re.sub() uses below is assumed. Confirm against
    # the regex101 link above.
    thumbnail_pattern = r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]'
    matched_google_images_thumbnails = ", ".join(
        re.findall(thumbnail_pattern, str(matched_google_image_data))).split(", ")

    # Decoded twice: after the first pass escaped Unicode characters are still present.
    thumbnails = [
        bytes(bytes(thumbnail, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape") for thumbnail in matched_google_images_thumbnails
    ]

    # removing previously matched thumbnails for easier full resolution image matches.
    removed_matched_google_images_thumbnails = re.sub(thumbnail_pattern, "", str(matched_google_image_data))

    # https://regex101.com/r/fXjfb1/4
    # https://stackoverflow.com/a/19821774/15164646
    matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]", removed_matched_google_images_thumbnails)

    full_res_images = [
        bytes(bytes(img, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape") for img in matched_google_full_resolution_images
    ]

    # urlretrieve() cannot write into a directory that does not exist yet.
    os.makedirs("Bs4_Images", exist_ok=True)

    # The opener is installed globally, so building it once before the loop is enough.
    opener = urllib.request.build_opener()
    opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36")]
    urllib.request.install_opener(opener)

    for index, (metadata, thumbnail, original) in enumerate(zip(soup.select(".isv-r.PNCib.MSM1fd.BUooTd"), thumbnails, full_res_images), start=1):
        # Title and link live on the same element; look it up once.
        anchor = metadata.select_one(".VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb")

        google_images.append({
            "title": anchor["title"],
            "link": anchor["href"],
            "source": metadata.select_one(".fxgdke").text,
            "thumbnail": thumbnail,
            "original": original
        })

        # Download original images
        print(f"Downloading {index} image...")
        urllib.request.urlretrieve(original, f"Bs4_Images/original_size_img_{index}.jpg")

    return google_images
The process is almost identical to extracting suggested search results except for different regular expressions:
1. Create a temporary list
where extracted data will be appended.
2. Extracting all_script_tags
3. Extracting matched_images_data
to extract thumbnails and original resolution images.
4. Decode extracted encoded thumbnails
thumbnails = [
    bytes(bytes(thumbnail, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape") for thumbnail in matched_google_images_thumbnails
]

# equivalent to
thumbnails = []
for fixed_google_image_thumbnail in matched_google_images_thumbnails:
    # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
    google_image_thumbnail_not_fixed = bytes(fixed_google_image_thumbnail, 'ascii').decode('unicode-escape')

    # after first decoding, Unicode characters are still present. After the second iteration, they were decoded.
    google_image_thumbnail = bytes(google_image_thumbnail_not_fixed, 'ascii').decode('unicode-escape')

    # collect the decoded value so the loop produces the same list as the comprehension
    thumbnails.append(google_image_thumbnail)
5. Decode extracted encoded full_res_images
full_res_images = [
    bytes(bytes(img, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape") for img in matched_google_full_resolution_images
]

# equivalent to
full_res_images = []
for fixed_full_res_image in matched_google_full_resolution_images:
    # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
    original_size_img_not_fixed = bytes(fixed_full_res_image, 'ascii').decode('unicode-escape')

    # second pass: escaped Unicode characters are still present after the first decoding
    original_size_img = bytes(original_size_img_not_fixed, 'ascii').decode('unicode-escape')

    # collect the decoded value so the loop produces the same list as the comprehension
    full_res_images.append(original_size_img)
Save full resolution images locally:
# Build an opener, attach the User-Agent header, and install it globally so
# that urlretrieve() sends the header with its request.
opener = urllib.request.build_opener()
opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36")]
urllib.request.install_opener(opener)

# `original` and `index` come from the enclosing loop in get_original_images().
urllib.request.urlretrieve(original, f"Bs4_Images/original_size_img_{index}.jpg")
Code | Explanation |
urllib.request.build_opener() |
manages the chaining of handlers and will automatically add headers on each request (row below). |
opener.addheaders = [()] |
to add headers to the request. |
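urllib.request.install_opener() |
sets opener as the default global opener, meaning later urllib.request.urlopen() and urllib.request.urlretrieve() calls go through it and send its headers. |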
urllib.request.urlretrieve() |
to save images locally. |
Printed returned data:
"title": "4K Minecraft Wallpapers | Background Images",
"link": "https://wall.alphacoders.com/tag/4k-minecraft-wallpapers",
"source": "wall.alphacoders.com",
"thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSJxrGh1FUsvCRNgKI4aiM8CimALQ0rHU2SDigSRl6X1c7BiWDOUMMMVCwyKtufB9SEddw&usqp=CAU",
"original": "https://images6.alphacoders.com/108/thumb-1920-1082090.jpg"
"title": "Best Minecraft Wallpaper 4k - Minecraft Tutos",
"link": "https://minecraft-tutos.com/en/minecraft-wallpaper/",
"source": "minecraft-tutos.com",
"thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTRDMguXava6khO5e5A0GQsm5v64rrJI_tYuSaJjyxWQNhTrhRWPRLLuhtPVouOUSaqzC0&usqp=CAU",
"original": "https://minecraft-tutos.com/wp-content/uploads/2022/03/wallpaper-minecraft-alex-steve-universe.jpeg"
}, ... other results
Using Google Images API
The main difference is that it's a quicker approach. No need to figure out regular expressions, create a parser and maintain it over time, or how to scale the number of requests without being blocked.
Example with pagination and multiple search queries:
def serpapi_get_google_images():
    """Collect original image URLs for several queries via SerpApi and download them.

    For each query, pages are requested until the response contains an
    ``"error"`` key. Every distinct ``original`` URL is collected in
    first-seen order, downloaded to
    ``SerpApi_Images/original_size_img_<index>.jpg``, and the collected list
    is printed as JSON.

    The API key is read from the ``API_KEY`` environment variable.
    """
    image_results = []
    seen_originals = set()  # O(1) duplicate check; image_results keeps first-seen order

    for query in ["Coffee", "boat", "skyrim", "minecraft"]:
        # search query parameters
        params = {
            "engine": "google",               # search engine. Google, Bing, Yahoo, Naver, Baidu...
            "q": query,                       # search query
            "tbm": "isch",                    # image results
            "num": "100",                     # number of images per page
            "ijn": 0,                         # page number: 0 -> first page, 1 -> second...
            "api_key": os.getenv("API_KEY")   # your serpapi api key
            # other query parameters: hl (lang), gl (country), etc
        }

        search = GoogleSearch(params)         # where data extraction happens

        images_is_present = True
        while images_is_present:
            results = search.get_dict()       # JSON -> Python dictionary

            # checks for "Google hasn't returned any results for this query."
            if "error" not in results:
                for image in results["images_results"]:
                    if image["original"] not in seen_originals:
                        seen_originals.add(image["original"])
                        image_results.append(image["original"])

                # update to the next page
                # NOTE(review): presumably GoogleSearch reads this same dict on
                # the next get_dict() call. Verify against the serpapi client.
                params["ijn"] += 1
            else:
                images_is_present = False

    # -----------------------
    # Downloading images

    # urlretrieve() cannot write into a directory that does not exist yet.
    os.makedirs("SerpApi_Images", exist_ok=True)

    # The opener is installed globally, so building it once before the loop is enough.
    opener = urllib.request.build_opener()
    opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36")]
    urllib.request.install_opener(opener)

    # Download from the collected URLs rather than from `results`: at this
    # point `results` is only the last response fetched, which is the one
    # that ended pagination.
    for index, original in enumerate(image_results, start=1):
        print(f"Downloading {index} image...")
        urllib.request.urlretrieve(original, f"SerpApi_Images/original_size_img_{index}.jpg")

    print(json.dumps(image_results, indent=2))
2349 # number of total extracted images
- Code in the online IDE
- Google Images API
- Github Gist
- Video API tutorial: Web Scraping all Google Images in Python and SerpApi
Add a Feature Request💫 or a Bug🐞
Top comments (0)