🔨What is required: Understanding of loops, data structures, exception handling. serpapi
, pandas
, urllib
libraries.
⏱️How long will it take: ~15-30 minutes to read and implement.
What will be scraped
Prerequisites
Separate virtual environment
In short, it's a thing that creates an independent set of installed libraries including different Python versions that can coexist with each other at the same system thus prevention libraries or Python version conflicts.
If you didn't work with a virtual environment before, have a look at the dedicated Python virtual environments tutorial using Virtualenv and Poetry blog post of mine to get familiar.
Install libraries:
pip install pandas, google-search-results
Process
If you don't need an explanation:
- jump to the full code section,
- grab the full code from the GitHub repository,
- try it in the online IDE.
Scrape all Google Scholar Profile Results
import os
from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
import pandas as pd
def profile_results():
print("Extracting profile results..")
params = {
"api_key": os.getenv("API_KEY"), # SerpApi API key
"engine": "google_scholar_profiles", # profile results search engine
"mauthors": "blizzard", # search query
}
search = GoogleSearch(params)
profile_results_data = []
profiles_is_present = True
while profiles_is_present:
profile_results = search.get_dict()
for profile in profile_results.get("profiles", []):
print(f'Currently extracting {profile.get("name")} with {profile.get("author_id")} ID.')
thumbnail = profile.get("thumbnail")
name = profile.get("name")
link = profile.get("link")
author_id = profile.get("author_id")
affiliations = profile.get("affiliations")
email = profile.get("email")
cited_by = profile.get("cited_by")
interests = profile.get("interests")
profile_results_data.append({
"thumbnail": thumbnail,
"name": name,
"link": link,
"author_id": author_id,
"email": email,
"affiliations": affiliations,
"cited_by": cited_by,
"interests": interests
})
if "next" in profile_results["pagination"]:
search.params_dict.update(dict(parse_qsl(urlsplit(profile_results["pagination"]["next"]).query)))
else:
profiles_is_present = False
return profile_results_data
Scraping all profile results explanation
Import libraries:
import os
from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
import pandas as pd
Pass search parameters to SerpApi and create a temp list()
:
params = {
"api_key": os.getenv("API_KEY"), # SerpApi API key
"engine": "google_scholar_profiles", # profile results search engine
"mauthors": "blizzard", # search query
}
search = GoogleSearch(params)
profile_results_data = []
Set up a while
loop and add a if
statement to exit the while
loop if no pages left:
profiles_is_present = True
while profiles_is_present:
profile_results = search.get_dict()
# for loop extraction here..
# if next page in SerpApi pagination -> update params to new a page results.
# if no next page -> exit the while loop.
if "next" in profile_results.get("pagination", []):
search.params_dict.update(dict(parse_qsl(urlsplit(profile_results.get("pagination").get("next")).query)))
else:
profiles_is_present = False
Iterate over profile results in a for
loop:
for profile in profile_results.get("profiles", []):
print(f'Currently extracting {profile.get("name")} with {profile.get("author_id")} ID.')
thumbnail = profile.get("thumbnail")
name = profile.get("name")
link = profile.get("link")
author_id = profile.get("author_id")
affiliations = profile.get("affiliations")
email = profile.get("email")
cited_by = profile.get("cited_by")
interests = profile.get("interests")
Append extracted data to temporary list
as a dictionary and return
it:
profile_results_data.append({
"thumbnail": thumbnail,
"name": name,
"link": link,
"author_id": author_id,
"email": email,
"affiliations": affiliations,
"cited_by": cited_by,
"interests": interests
return profile_results_data
# example output:
'''
Extracting profile results..
Currently extracting Adam Lobel with _xwYD2sAAAAJ ID.
... other profiles
[
{
"thumbnail": "https://scholar.googleusercontent.com/citations?view_op=small_photo&user=_xwYD2sAAAAJ&citpid=3",
"name": "Adam Lobel",
"link": "https://scholar.google.com/citations?hl=en&user=_xwYD2sAAAAJ",
"author_id": "_xwYD2sAAAAJ",
"email": "Verified email at AdamLobel.com",
"affiliations": "Blizzard Entertainment",
"cited_by": 2935,
"interests": [
{
"title": "Gaming",
"serpapi_link": "https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=label%3Agaming",
"link": "https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=label:gaming"
},
{
"title": "Emotion regulation",
"serpapi_link": "https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=label%3Aemotion_regulation",
"link": "https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=label:emotion_regulation"
}
]
},
... other profiles
]
'''
Scrape Google Scholar Author Results
import os
from serpapi import GoogleSearch
from google_scholar_profile_results import profile_results
from urllib.parse import urlsplit, parse_qsl
import pandas as pd
def author_results():
print("extracting author results..")
author_results_data = []
for author_id in profile_results():
print(f"Parsing {author_id['author_id']} author ID.")
params = {
"api_key": os.getenv("API_KEY"), # SerpApi API key
"engine": "google_scholar_author", # author results search engine
"author_id": author_id["author_id"], # search query
"hl": "en"
}
search = GoogleSearch(params)
results = search.get_dict()
thumbnail = results.get("author").get("thumbnail")
name = results.get("author").get("name")
affiliations = results.get("author").get("affiliations")
email = results.get("author").get("email")
website = results.get("author").get("website")
interests = results.get("author").get("interests")
cited_by_table = results.get("cited_by", {}).get("table")
cited_by_graph = results.get("cited_by", {}).get("graph")
public_access_link = results.get("public_access", {}).get("link")
available_public_access = results.get("public_access", {}).get("available")
not_available_public_access = results.get("public_access", {}).get("not_available")
co_authors = results.get("co_authors")
author_results_data.append({
"thumbnail": thumbnail,
"name": name,
"affiliations": affiliations,
"email": email,
"website": website,
"interests": interests,
"cited_by_table": cited_by_table,
"cited_by_graph": cited_by_graph,
"public_access_link": public_access_link,
"available_public_access": available_public_access,
"not_available_public_access": not_available_public_access,
"co_authors": co_authors
})
return author_results_data
Scraping author results explanation
Import profile_results()
function and other libraries:
import os
from serpapi import GoogleSearch
from google_scholar_profile_results import profile_results
from urllib.parse import urlsplit, parse_qsl
import pandas as pd
profile_results()
will iterate over all available pages and return a dictionary including author ID result, for example _xwYD2sAAAAJ
.
Create temporary list
to store extracted data:
author_results_data = []
Iterate over extracted profiles, passauthor_id
to author_id
search parameter:
for author_id in profile_results():
print(f"Parsing {author_id['author_id']} author ID.")
params = {
"api_key": os.getenv("API_KEY"), # SerpApi API key
"engine": "google_scholar_author", # author results search engine
"author_id": author_id["author_id"], # search query: _xwYD2sAAAAJ
"hl": "en"
}
search = GoogleSearch(params)
results = search.get_dict()
Extract the data:
thumbnail = results.get("author").get("thumbnail")
name = results.get("author").get("name")
affiliations = results.get("author").get("affiliations")
email = results.get("author").get("email")
website = results.get("author").get("website")
interests = results.get("author").get("interests")
cited_by_table = results.get("cited_by", {}).get("table")
cited_by_graph = results.get("cited_by", {}).get("graph")
public_access_link = results.get("public_access", {}).get("link")
available_public_access = results.get("public_access", {}).get("available")
not_available_public_access = results.get("public_access", {}).get("not_available")
co_authors = results.get("co_authors")
Append extracted data to temporary list
as a dictionary and return
it:
author_results_data.append({
"thumbnail": thumbnail,
"name": name,
"affiliations": affiliations,
"email": email,
"website": website,
"interests": interests,
"cited_by_table": cited_by_table,
"cited_by_graph": cited_by_graph,
"public_access_link": public_access_link,
"available_public_access": available_public_access,
"not_available_public_access": not_available_public_access,
"co_authors": co_authors
})
return author_results_data
# example output:
'''
extracting author results..
Extracting profile results..
Currently extracting Adam Lobel with _xwYD2sAAAAJ ID.
... other authors
Parsing _xwYD2sAAAAJ author ID.
... other authors
[
{
"thumbnail": "https://scholar.googleusercontent.com/citations?view_op=view_photo&user=_xwYD2sAAAAJ&citpid=3",
"name": "Adam Lobel",
"affiliations": "Blizzard Entertainment",
"email": "Verified email at AdamLobel.com",
"website": "https://twitter.com/GrowingUpGaming",
"interests": [
{
"title": "Gaming",
"link": "https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:gaming",
"serpapi_link": "https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=label%3Agaming"
},
{
"title": "Emotion regulation",
"link": "https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:emotion_regulation",
"serpapi_link": "https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=label%3Aemotion_regulation"
}
],
"cited_by_table": [
{
"citations": {
"all": 2935,
"since_2017": 2348
}
},
{
"h_index": {
"all": 10,
"since_2017": 10
}
},
{
"i10_index": {
"all": 11,
"since_2017": 10
}
}
],
"cited_by_graph": [
{
"year": 2014,
"citations": 70
},
{
"year": 2015,
"citations": 188
},
{
"year": 2016,
"citations": 243
},
{
"year": 2017,
"citations": 342
},
{
"year": 2018,
"citations": 420
},
{
"year": 2019,
"citations": 553
},
{
"year": 2020,
"citations": 507
},
{
"year": 2021,
"citations": 504
},
{
"year": 2022,
"citations": 16
}
],
"public_access_link": "https://scholar.google.com/citations?view_op=list_mandates&hl=en&user=_xwYD2sAAAAJ",
"available_public_access": 1,
"not_available_public_access": 0,
"co_authors": [
{
"name": "Isabela Granic",
"link": "https://scholar.google.com/citations?user=4T5cjVIAAAAJ&hl=en",
"serpapi_link": "https://serpapi.com/search.json?author_id=4T5cjVIAAAAJ&engine=google_scholar_author&hl=en",
"author_id": "4T5cjVIAAAAJ",
"affiliations": "Radboud University Nijmegen",
"email": "Verified email at pwo.ru.nl",
"thumbnail": "https://scholar.googleusercontent.com/citations?view_op=small_photo&user=4T5cjVIAAAAJ&citpid=4"
},
... other co-authors
}
]
}
... other authors
]
'''
Scrape all Author Articles from Google Scholar
import os
from serpapi import GoogleSearch
from google_scholar_profile_results import profile_results
from urllib.parse import urlsplit, parse_qsl
import pandas as pd
def all_author_articles():
author_article_results_data = []
for index, author_id in enumerate(profile_results(), start=1):
print(f"Parsing {index} author with {author_id['author_id']} author ID.")
params = {
"api_key": os.getenv("API_KEY"), # SerpApi API key
"engine": "google_scholar_author", # author results search engine
"hl": "en", # language
"sort": "pubdate", # sort by year
"author_id": author_id["author_id"] # search query
}
search = GoogleSearch(params)
articles_is_present = True
while articles_is_present:
results = search.get_dict()
for article in results.get("articles", []):
title = article.get("title")
link = article.get("link")
citation_id = article.get("citation_id")
authors = article.get("authors")
publication = article.get("publication")
cited_by_value = article.get("cited_by", {}).get("value")
cited_by_link = article.get("cited_by", {}).get("link")
cited_by_cites_id = article.get("cited_by", {}).get("cites_id")
year = article.get("year")
author_article_results_data.append({
"article_title": title,
"article_link": link,
"article_year": year,
"article_citation_id": citation_id,
"article_authors": authors,
"article_publication": publication,
"article_cited_by_value": cited_by_value,
"article_cited_by_link": cited_by_link,
"article_cited_by_cites_id": cited_by_cites_id,
})
if "next" in results.get("serpapi_pagination", []):
search.params_dict.update(dict(parse_qsl(urlsplit(results.get("serpapi_pagination").get("next")).query)))
else:
articles_is_present = False
return author_article_results_data
Scraping all author articles explanation
Import profile_results()
function and other libraries:
import os
from serpapi import GoogleSearch
from google_scholar_profile_results import profile_results
from urllib.parse import urlsplit, parse_qsl
import pandas as pd
In this case profile_results()
was used to get author_id
as well, in order to parse author articles.
Create temporary list
to store extracted data:
author_article_results_data = []
Iterate over profile_results()
and pass author_id
to parameter search query:
for index, author_id in enumerate(profile_results(), start=1):
print(f"Parsing {index} author with {author_id['author_id']} author ID.")
params = {
"api_key": os.getenv("API_KEY"), # SerpApi API key
"engine": "google_scholar_author", # author results search engine
"hl": "en", # language
"sort": "pubdate", # sort by year
"author_id": author_id["author_id"] # search query
}
search = GoogleSearch(params)
Set up a while
loop and check if
next page is present:
articles_is_present = True
while articles_is_present:
results = search.get_dict()
# data extraction code..
# if next page is present -> update previous results to new page results.
# if next page is not present -> exit the while loop.
if "next" in results.get("serpapi_pagination", []):
search.params_dict.update(dict(parse_qsl(urlsplit(results.get("serpapi_pagination").get("next")).query)))
else:
articles_is_present = False
Extract data in a for
loop:
for article in results.get("articles", []):
title = article.get("title")
link = article.get("link")
citation_id = article.get("citation_id")
authors = article.get("authors")
publication = article.get("publication")
cited_by_value = article.get("cited_by", {}).get("value")
cited_by_link = article.get("cited_by", {}).get("link")
cited_by_cites_id = article.get("cited_by", {}).get("cites_id")
year = article.get("year")
Append
extracted data to temporary list
as a dictionary:
author_article_results_data.append({
"article_title": title,
"article_link": link,
"article_year": year,
"article_citation_id": citation_id,
"article_authors": authors,
"article_publication": publication,
"article_cited_by_value": cited_by_value,
"article_cited_by_link": cited_by_link,
"article_cited_by_cites_id": cited_by_cites_id,
})
Return
extracted data:
return author_article_results_data
Save Google Scholar Profile and Author Results to CSV
from google_scholar_profile_results import profile_results
import pandas as pd
def save_profile_results_to_csv():
print("Waiting for profile results to save..")
pd.DataFrame(data=profile_results()).to_csv("google_scholar_profile_results.csv", encoding="utf-8", index=False)
print("Profile Results Saved.")
def save_author_result_to_csv():
print("Waiting for author results to save..")
pd.DataFrame(data=profile_results()).to_csv("google_scholar_author_results.csv", encoding="utf-8", index=False)
print("Author Results Saved.")
def save_author_articles_to_csv():
print("Waiting for author articles to save..")
pd.DataFrame(data=profile_results()).to_csv("google_scholar_author_articles.csv", encoding="utf-8", index=False)
print("Author Articles Saved.")
-
data
argument insideDataFrame
is your data. -
encoding='utf-8'
argument just to make sure everything will be saved correctly. I used it explicitly even thought it's a default value. -
index=False
argument to drop defaultpandas
row numbers.
Full Code
import os
from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
import pandas as pd
def profile_results():
print("Extracting profile results..")
params = {
"api_key": os.getenv("API_KEY"), # SerpApi API key
"engine": "google_scholar_profiles", # profile results search engine
"mauthors": "blizzard", # search query
}
search = GoogleSearch(params)
profile_results_data = []
profiles_is_present = True
while profiles_is_present:
profile_results = search.get_dict()
for profile in profile_results.get("profiles", []):
print(f'Currently extracting {profile.get("name")} with {profile.get("author_id")} ID.')
thumbnail = profile.get("thumbnail")
name = profile.get("name")
link = profile.get("link")
author_id = profile.get("author_id")
affiliations = profile.get("affiliations")
email = profile.get("email")
cited_by = profile.get("cited_by")
interests = profile.get("interests")
profile_results_data.append({
"thumbnail": thumbnail,
"name": name,
"link": link,
"author_id": author_id,
"email": email,
"affiliations": affiliations,
"cited_by": cited_by,
"interests": interests
})
if "next" in profile_results.get("pagination", []):
search.params_dict.update(dict(parse_qsl(urlsplit(profile_results.get("pagination").get("next")).query)))
else:
profiles_is_present = False
return profile_results_data
def author_results():
print("extracting author results..")
author_results_data = []
for author_id in profile_results():
print(f"Parsing {author_id['author_id']} author ID.")
params = {
"api_key": os.getenv("API_KEY"), # SerpApi API key
"engine": "google_scholar_author", # author results search engine
"author_id": author_id["author_id"], # search query
"hl": "en"
}
search = GoogleSearch(params)
results = search.get_dict()
thumbnail = results.get("author").get("thumbnail")
name = results.get("author").get("name")
affiliations = results.get("author").get("affiliations")
email = results.get("author").get("email")
website = results.get("author").get("website")
interests = results.get("author").get("interests")
cited_by_table = results.get("cited_by", {}).get("table")
cited_by_graph = results.get("cited_by", {}).get("graph")
public_access_link = results.get("public_access", {}).get("link")
available_public_access = results.get("public_access", {}).get("available")
not_available_public_access = results.get("public_access", {}).get("not_available")
co_authors = results.get("co_authors")
author_results_data.append({
"thumbnail": thumbnail,
"name": name,
"affiliations": affiliations,
"email": email,
"website": website,
"interests": interests,
"cited_by_table": cited_by_table,
"cited_by_graph": cited_by_graph,
"public_access_link": public_access_link,
"available_public_access": available_public_access,
"not_available_public_access": not_available_public_access,
"co_authors": co_authors
})
return author_results_data
def all_author_articles():
author_article_results_data = []
for index, author_id in enumerate(profile_results(), start=1):
print(f"Parsing author #{index} with {author_id['author_id']} author ID.")
params = {
"api_key": os.getenv("API_KEY"), # SerpApi API key
"engine": "google_scholar_author", # author results search engine
"hl": "en", # language
"sort": "pubdate", # sort by year
"author_id": author_id["author_id"] # search query
}
search = GoogleSearch(params)
articles_is_present = True
while articles_is_present:
results = search.get_dict()
for article in results.get("articles", []):
title = article.get("title")
link = article.get("link")
citation_id = article.get("citation_id")
authors = article.get("authors")
publication = article.get("publication")
cited_by_value = article.get("cited_by", {}).get("value")
cited_by_link = article.get("cited_by", {}).get("link")
cited_by_cites_id = article.get("cited_by", {}).get("cites_id")
year = article.get("year")
author_article_results_data.append({
"article_title": title,
"article_link": link,
"article_year": year,
"article_citation_id": citation_id,
"article_authors": authors,
"article_publication": publication,
"article_cited_by_value": cited_by_value,
"article_cited_by_link": cited_by_link,
"article_cited_by_cites_id": cited_by_cites_id,
})
if "next" in results.get("serpapi_pagination", []):
search.params_dict.update(dict(parse_qsl(urlsplit(results.get("serpapi_pagination").get("next")).query)))
else:
articles_is_present = False
return author_article_results_data
def save_author_result_to_csv():
print("Waiting for author results to save..")
pd.DataFrame(data=profile_results()).to_csv("google_scholar_author_results.csv", encoding="utf-8", index=False)
print("Author Results Saved.")
def save_author_articles_to_csv():
print("Waiting for author articles to save..")
pd.DataFrame(data=profile_results()).to_csv("google_scholar_author_articles.csv", encoding="utf-8", index=False)
print("Author Articles Saved.")
def save_profile_results_to_csv():
print("Waiting for profile results to save..")
pd.DataFrame(data=profile_results()).to_csv("google_scholar_profile_results.csv", encoding="utf-8", index=False)
print("Profile Results Saved.")
Links
Outro
If your goal is to extract data without the need to write a parser from scratch, figure out how to bypass blocks from search engines, how to scale it or how to extract data from JavaScript - have a try SerpApi.
If you have anything to share, any questions, suggestions, or something that isn't working correctly, feel free to drop a comment in the comment section or reach out via Twitter at @dimitryzub, or @serp_api.
Join us on Reddit | Twitter | YouTube
Add a Feature Request💫 or a Bug🐞
Top comments (0)