What will be scraped
Full Code
import requests, json, re
from parsel import Selector
def scrape_google_finance_main_page():
# https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
# https://www.whatismybrowser.com/detect/what-is-my-user-agent
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.134 Safari/537.36"
}
html = requests.get(f"https://www.google.com/finance/", headers=headers, timeout=30)
selector = Selector(text=html.text)
# where all extracted data will be temporary located
ticker_data = {
"market_trends": [],
"interested_in": {
"top_position": [],
"bottom_position": []
},
"earning_calendar": [],
"most_followed_on_google": [],
"news": [],
}
# Market trends top results
ticker_data["market_trends"] = selector.css(".gR2U6::text").getall()
# Earnings calendar results
for calendar_quote in selector.css(".d3fRjc"):
ticker_data["earning_calendar"].append({
"quote": calendar_quote.css(".yaubCc::text").get(),
"quote_link": f'https://www.google.com/finance/quote{calendar_quote.css(".yaubCc::attr(href)").get().replace("./quote/", "/")}',
"short_date": calendar_quote.css(".JiAI5b").xpath("normalize-space()").get(),
"full_date": calendar_quote.css(".fVovwd::text").get()
})
# Most followed on Google results
for google_most_followed in selector.css(".NaLFgc"):
current_percent_change_raw_value = google_most_followed.css("[jsname=Fe7oBc]::attr(aria-label)").get()
current_percent_change = re.search(r"by\s?(\d+\.\d+)%", google_most_followed.css("[jsname=Fe7oBc]::attr(aria-label)").get()).group(1)
ticker_data["most_followed_on_google"].append({
"title": google_most_followed.css(".TwnKPb::text").get(),
"quote": re.search(r"\.\/quote\/(\w+):",google_most_followed.attrib["href"]).group(1), # https://regex101.com/r/J3DDIX/1
"following": re.search(r"(\d+\.\d+)M", google_most_followed.css(".Iap8Fc::text").get()).group(1), # https://regex101.com/r/7ptVha/1
"percent_price_change": f"+{current_percent_change}" if "Up" in current_percent_change_raw_value else f"-{current_percent_change}"
})
# news results. If empty -> run once again. For some reason it could return [].
for index, news in enumerate(selector.css(".yY3Lee"), start=1):
ticker_data["news"].append({
"position": index,
"title": news.css(".Yfwt5::text").get(),
"link": news.css(".z4rs2b a::attr(href)").get(),
"source": news.css(".sfyJob::text").get(),
"published": news.css(".Adak::text").get(),
"thumbnail": news.css("img.Z4idke::attr(src)").get()
})
# "you may be interested in" at the top of the page results
for index, interested_top in enumerate(selector.css(".sbnBtf:not(.xJvDsc) .SxcTic"), start=1):
current_percent_change_raw_value = interested_top.css("[jsname=Fe7oBc]::attr(aria-label)").get()
current_percent_change = re.search(r"\d{1}%|\d{1,10}\.\d{1,2}%", interested_top.css("[jsname=Fe7oBc]::attr(aria-label)").get()).group()
ticker_data["interested_in"]["top_position"].append({
"index": index,
"title": interested_top.css(".ZvmM7::text").get(),
"quote": interested_top.css(".COaKTb::text").get(),
"price_change": interested_top.css(".SEGxAb .P2Luy::text").get(),
"percent_price_change": f"+{current_percent_change}" if "Up" in current_percent_change_raw_value else f"-{current_percent_change}"
})
# "you may be interested in" at the bottom of the page results
for index, interested_bottom in enumerate(selector.css(".HDXgAf .tOzDHb"), start=1):
# single function to handle both top and bottom
# "you may be interested results" as selectors is identical
current_percent_change_raw_value = interested_bottom.css("[jsname=Fe7oBc]::attr(aria-label)").get()
current_percent_change = re.search(r"\d{1}%|\d{1,10}\.\d{1,2}%", interested_bottom.css("[jsname=Fe7oBc]::attr(aria-label)").get()).group()
ticker_data["interested_in"]["bottom_position"].append({
"position": index,
"ticker": interested_bottom.css(".COaKTb::text").get(),
"ticker_link": f'https://www.google.com/finance{interested_bottom.attrib["href"].replace("./", "/")}',
"title": interested_bottom.css(".RwFyvf::text").get(),
"price": interested_bottom.css(".YMlKec::text").get(),
"percent_price_change": f"+{current_percent_change}" if "Up" in current_percent_change_raw_value else f"-{current_percent_change}"
})
return ticker_data
print(json.dumps(scrape_google_finance_main_page(), indent=2, ensure_ascii=False))
Prerequisites
Install libraries:
pip install requests parsel
Basic knowledge scraping with CSS selectors
CSS selectors declare which part of the markup a style applies to thus allowing to extract data from matching tags and attributes.
If you haven't scraped with CSS selectors, there's a dedicated blog post of mine
about how to use CSS selectors when web-scraping that covers what it is, pros and cons, and why they matter from a web-scraping perspective.
Separate virtual environment
In short, it's a thing that creates an independent set of installed libraries including different Python versions that can coexist with each other in the same system thus preventing libraries or Python version conflicts.
If you didn't work with a virtual environment before, have a look at the
dedicated Python virtual environments tutorial using Virtualenv and Poetry blog post of mine to get a little bit more familiar.
📌Note: this is not a strict requirement for this blog.
Reduce the chance of being blocked
There's a chance that a request might be blocked. Have a look
at how to reduce the chance of being blocked while web-scraping, there are eleven methods to bypass blocks from most websites.
Code Explanation
Import libraries:
import requests, json, re
from parsel import Selector
Library | Purpose |
---|---|
requests |
to make a request to the website. |
json |
to convert extracted data to a JSON object. |
re |
to extract parts of the data via regular expression. |
parsel |
to parse data from HTML/XML documents. Similar to BeautifulSoup . |
Create request headers:
# https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
# https://www.whatismybrowser.com/detect/what-is-my-user-agent
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
}
Library | Purpose |
---|---|
user-agent |
to act as a "real" user request from the browser by passing it to request headers. Check what's your user-agent . |
Pass requests headers to requests.get()
while making a request, and pass response to parsel
:
html = requests.get(f"https://www.google.com/finance/", headers=headers, timeout=30)
selector = Selector(text=html.text)
Code | Explanation |
---|---|
timeout=30 |
to stop waiting for response after 30 seconds. |
Selector(text=html.text) |
where passed HTML from the response will be processed by parsel . |
text= |
is a parsel argument that accepts str object from where HTML nodes will be extracted. |
Create an empty dictionary structure where all the data will be filled in later:
# where all extracted data will be temporary located
ticker_data = {
"market_trends": [],
"interested_in": {
"top_position": [],
"bottom_position": []
},
"earning_calendar": [],
"most_followed_on_google": [],
"news": [],
}
Extracting data:
# Market trends results
ticker_data["market_trends"] = selector.css(".gR2U6::text").getall()
# Earnings calendar results
for calendar_quote in selector.css(".d3fRjc"):
ticker_data["earning_calendar"].append({
"quote": calendar_quote.css(".yaubCc::text").get(),
"quote_link": f'https://www.google.com/finance/quote{calendar_quote.css(".yaubCc::attr(href)").get().replace("./quote/", "/")}',
"short_date": calendar_quote.css(".JiAI5b").xpath("normalize-space()").get(),
"full_date": calendar_quote.css(".fVovwd::text").get()
})
# Most followed on Google results
for google_most_followed in selector.css(".NaLFgc"):
current_percent_change_raw_value = google_most_followed.css("[jsname=Fe7oBc]::attr(aria-label)").get()
current_percent_change = re.search(r"by\s?(\d+\.\d+)%", google_most_followed.css("[jsname=Fe7oBc]::attr(aria-label)").get()).group(1)
ticker_data["most_followed_on_google"].append({
"title": google_most_followed.css(".TwnKPb::text").get(),
"quote": re.search(r"\.\/quote\/(\w+):",google_most_followed.attrib["href"]).group(1), # https://regex101.com/r/J3DDIX/1
"following": re.search(r"(\d+\.\d+)M", google_most_followed.css(".Iap8Fc::text").get()).group(1), # https://regex101.com/r/7ptVha/1
"percent_price_change": f"+{current_percent_change}" if "Up" in current_percent_change_raw_value else f"-{current_percent_change}"
})
# news results. If empty -> run once again. For some reason it could return [].
for index, news in enumerate(selector.css(".yY3Lee"), start=1):
ticker_data["news"].append({
"position": index,
"title": news.css(".Yfwt5::text").get(),
"link": news.css(".z4rs2b a::attr(href)").get(),
"source": news.css(".sfyJob::text").get(),
"published": news.css(".Adak::text").get(),
"thumbnail": news.css("img.Z4idke::attr(src)").get()
})
# "you may be interested in" at the bottom of the page results
for index, interested_top in enumerate(selector.css(".sbnBtf:not(.xJvDsc) .SxcTic"), start=1):
current_percent_change_raw_value = interested_top.css("[jsname=Fe7oBc]::attr(aria-label)").get()
current_percent_change = re.search(r"\d{1}%|\d{1,10}\.\d{1,2}%", interested_top.css("[jsname=Fe7oBc]::attr(aria-label)").get()).group()
ticker_data["interested_in"]["top_position"].append({
"index": index,
"title": interested_top.css(".ZvmM7::text").get(),
"quote": interested_top.css(".COaKTb::text").get(),
"price_change": interested_top.css(".SEGxAb .P2Luy::text").get(),
"percent_price_change": f"+{current_percent_change}" if "Up" in current_percent_change_raw_value else f"-{current_percent_change}"
})
# "you may be interested in" at the bottom of the page results
for index, interested_bottom in enumerate(selector.css(".HDXgAf .tOzDHb"), start=1):
# single function to handle both top and bottom
# "you may be interested results" as selectors is identical
current_percent_change_raw_value = interested_bottom.css("[jsname=Fe7oBc]::attr(aria-label)").get()
current_percent_change = re.search(r"\d{1}%|\d{1,10}\.\d{1,2}%", interested_bottom.css("[jsname=Fe7oBc]::attr(aria-label)").get()).group()
ticker_data["interested_in"]["bottom_position"].append({
"position": index,
"ticker": interested_bottom.css(".COaKTb::text").get(),
"ticker_link": f'https://www.google.com/finance{interested_bottom.attrib["href"].replace("./", "/")}',
"title": interested_bottom.css(".RwFyvf::text").get(),
"price": interested_bottom.css(".YMlKec::text").get(),
"percent_price_change": f"+{current_percent_change}" if "Up" in current_percent_change_raw_value else f"-{current_percent_change}"
})
Code | Explanation |
---|---|
ticker_data["market_trends"] |
accesses ["ticker_data"] key and creates a new key ["market_trends"] and assigns it to whatever value would be extracted by parsel . |
css() |
to parse data from the passed CSS selector(s). Every CSS query traslates to XPath using csselect package under the hood. |
[jsname=Fe7oBc] |
is a CSS selector that used to select elements with the specified attribute and value e.g. [attribute=value] . |
append() |
to append extracted data to the list as dictionary. |
getall() |
to get all a list of matches. |
get() |
to get actual data. |
::text or ::attr(<attribute>) |
to extract textual or attribute data from the node. |
xpath("normalize-space()") |
to parse blank text node as well. By default, blank text node is be skipped by XPath. |
replace("<something>", "<with_something>") |
to replace something old with something new in a string. |
enumerate() |
to add a counter to an iterable and return it. start=1 will start counting from 1, instead from the default value of 0. |
re.search() |
to match parts of the string and grab only digit values. group() to return matched string by a regular expression. |
Return and print the data:
# ticker_data = {
# "market_trends": [],
# "interested_in": {
# "top_position": [],
# "bottom_position": []
# },
# "earning_calendar": [],
# "most_followed_on_google": [],
# "news": [],
# }
# extraction code...
return ticker_data
print(json.dumps(scrape_google_finance_main_page(), indent=2, ensure_ascii=False))
Full output:
{
"market_trends": {
"top_position": [
"Market indexes",
"Most active",
"Gainers",
"Losers",
"Climate leaders",
"Crypto",
"Currencies"
],
"bottom_position": [
{
"index": 1,
"title": "Tesla Inc",
"quote": "TSLA",
"price": "$824.46",
"price_percent_change": "+0.59%"
}, ... other results
{
"index": 6,
"title": "BEL 20",
"quote": "Index",
"price": "3,774.05",
"price_percent_change": "+1.15%"
}
]
},
"interested_in": {
"top_position": [
{
"index": 1,
"title": "Tesla Inc",
"quote": "TSLA",
"price_change": "+$47.88",
"percent_price_change": "+6.17%"
}, ... other results
{
"index": 6,
"title": "BEL 20",
"quote": "Index",
"price_change": "+22.01",
"percent_price_change": "+0.59%"
}
],
"bottom_position": [
{
"position": 1,
"ticker": "Index",
"ticker_link": "https://www.google.com/finance/quote/BEL20:INDEXEURO",
"title": "BEL 20",
"price": "3,774.05",
"percent_price_change": "+0.59%"
}, ... other results
{
"position": 18,
"ticker": "PFE",
"ticker_link": "https://www.google.com/finance/quote/PFE:NYSE",
"title": "Pfizer Inc.",
"price": "$51.95",
"percent_price_change": "-0.67%"
}
]
},
"earning_calendar": [
{
"quote": "Apple",
"quote_link": "https://www.google.com/finance/quote/AAPL:NASDAQ",
"short_date": "Jul28",
"full_date": "Jul 28, 2022, 11:00 PM"
}, ... other results
{
"quote": "Occidental Petroleum",
"quote_link": "https://www.google.com/finance/quote/OXY:NYSE",
"short_date": "Aug2",
"full_date": "Aug 2, 2022, 10:00 PM"
}
],
"most_followed_on_google": [
{
"title": "Apple Inc",
"quote": "AAPL",
"following": "3.71",
"percent_price_change": "+3.42"
}, ... other results
{
"title": "Tesla Inc",
"quote": "TSLA",
"following": "1.49",
"percent_price_change": "+6.17"
}
],
"news": [
{
"position": 1,
"title": "This kind of shock to the economy will have consequences",
"link": "https://www.cnn.com/2022/07/27/politics/fed-interest-rate-volcker-what-matters/index.html",
"source": "CNN",
"published": "10 hours ago",
"thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRcLNm7uU5YfuvveVMWNvlQGUMcCPi4-7QJAqfKcDJgq7A3n1E_wiy53--_FFA"
}, ... other news
{
"position": 9,
"title": "The 20 Best Netflix Shows of All Time -- Ranked",
"link": "https://www.rollingstone.com/tv-movies/tv-movie-lists/best-netflix-shows-1386323/",
"source": "Rolling Stone",
"published": "20 hours ago",
"thumbnail": "https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcSsaABpxAxYW29MTnyeSHb1Z9ex1bMvXQQnFB5RJqz9LogWOR9zyOKw9YrjClI"
}
]
}
Links
Add a Feature Request💫 or a Bug🐞
Top comments (0)