In this post, we will scrape questions and answers from this site.
- The site contains 1000 Python questions separated by topic.
- Each topic is on its own page, but links to all the topics are on this page, which will be referred to as the index page in this article.
PS: This code is just a file in a project I'm currently working on. The project is a desktop app that generates MCQ questions for a Python course I'm teaching.
The code for this article is found in the scrape_sanfoundry_questions file in this GitHub repo. The code in this article is not exactly the same as in the repo (I've made some modifications here to ease comprehension).
NB: Look in the comments and multiline comments (triple-quoted """ strings) for explanations of the different code sections. I thought this approach would be more intuitive as this site doesn't include line numbers in the code sections.
Requirements
- playwright:
pip install playwright
and then
playwright install chromium
(the browser download weighs about 150 MB). You can also look at how to make Playwright work with a browser already installed on your computer; read the documentation.
- beautifulsoup:
pip install beautifulsoup4
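To confirm that both installs worked, you can run a quick smoke test like this one (my own sketch; it loads example.com and parses the page title):

from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup

# launch a headless browser, load a page, and hand its HTML to BeautifulSoup
with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto("https://example.com")
    print(BeautifulSoup(page.content(), "html.parser").title.text)  # Example Domain
    browser.close()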
Workflow
The basic algorithm for this is as follows:
- Open the index page, scrape the questions and answers from it, and save them to a text file.
- Get all the links to the other topics' questions.
- For each link: open the page, scrape the questions and answers, and save them to a text file.
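In code terms, those three steps boil down to something like this (a simplified preview; the full run function with browser setup and error handling comes later in the article):

# simplified preview of the run function defined later
soup = BeautifulSoup(page.content(), "html.parser")    # parse the index page
get_questions_from_soup(soup, file_path="Basics.txt")  # step 1: scrape and save it
for link in soup.select(".sf-2col-tbl li a"):          # step 2: get the topic links
    # step 3: scrape each topic's page and save it
    get_questions_from_page(page, link.get("href"), topic=link.text)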
Code Walkthrough
1. Initialization
import re
import os
from playwright.sync_api import Playwright, sync_playwright
from bs4 import BeautifulSoup

# used to check if a particular text is a question
# (a raw string avoids invalid-escape warnings for \d)
question_regex = re.compile(r"^(\d+\.)((\s*\w*\s*)*) (?<![\r])")

questions_directory_path = os.path.join(
    os.path.dirname(__file__), "questions"
)
if not os.path.exists(questions_directory_path):
    os.makedirs(questions_directory_path)

# str.strip(".txt") would remove individual characters, not the extension,
# so we use removesuffix (Python 3.9+) instead
already_downloaded_questions = {
    f.removesuffix(".txt") for f in os.listdir(questions_directory_path)
}

url = "https://www.sanfoundry.com/1000-python-questions-answers/"
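As a quick illustration of the regex: question paragraphs start with a number and a dot, so it matches them but not, say, the View Answer label (sample strings made up here):

print(bool(question_regex.search("1. What does the len function return?")))  # True
print(bool(question_regex.search("View Answer")))  # False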
2. Functions
Our code will have 3 functions:
- get_questions_from_page: takes a page object (more on that later), a url, and a topic name. It creates a soup object (used for selecting HTML elements from the page), generates a path to store the scraped QAs (questions and answers), and calls the get_questions_from_soup function.
- get_questions_from_soup: takes a soup object, a topic string, and a file_path. It then selects all the paragraphs using the soup object, scrapes QAs from the question paragraphs, and saves the results to file_path.
- run: takes a playwright instance. Think of it as our main function. It opens a browser at the url, scrapes the content of the first page, then loops through all the links on the page and scrapes their content as well (by calling the get_questions_from_page function).
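Here is how the three functions fit together (run scrapes the index page itself by calling get_questions_from_soup directly, and goes through get_questions_from_page for every other topic):

run(playwright)
├── get_questions_from_soup(soup, topic, file_path)   # once, for the index page
└── get_questions_from_page(page, url, topic)         # once per topic link
    └── get_questions_from_soup(soup, topic, file_path)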
3. Function implementations
1. get_questions_from_page
def get_questions_from_page(page, url, topic="Basics"):
    page.goto(url, timeout=0)
    try:
        # create a soup of the page's content
        soup = BeautifulSoup(page.content(), "html.parser")
        file_path = os.path.join(questions_directory_path, f"{topic}.txt")
        # get the questions and save them to a text file at file_path
        get_questions_from_soup(soup, topic, file_path=file_path)
    except Exception as e:
        # print the error instead of silently swallowing it
        print(f"Error in get_questions_from_page: {e}")
The soup created here allows us to access HTML elements on the page (BeautifulSoup can also parse XML).
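As a tiny illustration (made-up HTML), the soup's select method pulls out elements by CSS selector:

from bs4 import BeautifulSoup

html = "<div class='entry-content'><p>1. A sample question?</p></div>"
soup = BeautifulSoup(html, "html.parser")
print(soup.select(".entry-content>p")[0].text)  # 1. A sample question?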
2. get_questions_from_soup
def get_questions_from_soup(soup, topic="Basics", file_path="questions.txt"):
    paragraphs = soup.select(".entry-content>p")
    answers = soup.select(".entry-content>.collapseomatic_content")
    code_blocks = soup.select(".entry-content .hk1_style pre")
    paragraph_index, answer_index, code_block_index = 0, 0, 0
    """
    Not all paragraphs on the page are questions, some questions span
    multiple paragraphs, and not every question has a code block.
    So we need the above variables to keep track of where we're at
    and increment them as we advance (see the code below to understand
    better).
    """
    # list to keep track of all the QAs scraped
    texts = []
    """
    We're using answers as our loop condition because it is the only
    constant thing.
    Each question has one and only one answer element, so we can
    reliably use the answers to know which question number we're on.
    """
    while answer_index < len(answers):
        paragraph = paragraphs[paragraph_index].text.replace('\r', '')
        paragraph_index += 1
        if not question_regex.search(paragraph):
            print(f"paragraph: {paragraph} did not match test, continuing")
            continue
        # getting the answer for that particular question
        answer = answers[answer_index].text.replace('\r', '')
        answer_index += 1
        text = f"({topic}) {paragraph}"
        """
        The questions with their options in them have at least 2 lines:
        one for the question, a line for each option, and a line for the
        View Answer button.
        The questions with code blocks have just one paragraph for the
        question itself; the other parts (code and options) are stored in
        different elements.
        """
        if len(paragraph.split('\n')) >= 2:
            print("The question has the options in it, setting text to the question")
            # str.strip('View Answer') would remove individual characters,
            # not the phrase, so we use removesuffix instead
            paragraph_text = paragraph.rstrip().removesuffix('View Answer')
            text = f"({topic}) {paragraph_text}\n\n{answer}"
        else:
            print(f"This question has a code block, code_block_index: {code_block_index}")
            # this is the paragraph with the options
            answer_paragraph = paragraphs[paragraph_index].text.replace('\r', '').rstrip().removesuffix('View Answer')
            """
            Remember that paragraph_index was incremented after the initial
            paragraph was stored in a variable.
            paragraph_index is now pointing to the options paragraph.
            """
            code_block = code_blocks[code_block_index].text
            print("This question has a code block, the block is: ")
            print(code_block)
            text = f"{text}\n\n{code_block}\n\n{answer_paragraph}\n\n{answer}"
            code_block_index += 1
            paragraph_index += 1
        texts.append(text)
    # writing the texts list to the file_path passed as an argument
    with open(file_path, "w", encoding="utf-8") as file:
        file.write('\n\n\n\n'.join(texts))
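For reference, a saved entry ends up looking roughly like this (placeholder content, since the exact text depends on the page being scraped), with consecutive entries separated by four newlines:

(Basics) 1. A sample question?
a) first option
b) second option
c) third option
d) fourth option

Answer: a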
3. run
def run(playwright: Playwright) -> None:
    try:
        # set headless to True if you want this script to run
        # without opening a browser window.
        browser = playwright.chromium.launch(headless=False)
        context = browser.new_context()
        page = context.new_page()
        page.goto(url, timeout=0)
        # Scrape the questions from the index page
        soup = BeautifulSoup(page.content(), "html.parser")
        basics_file_name = os.path.join(questions_directory_path, "Basics.txt")
        get_questions_from_soup(soup, file_path=basics_file_name)
        # Get the links to the other topics' questions
        links = soup.select(".sf-2col-tbl li a")
        number_of_links = len(links)
        for index, link in enumerate(links):
            print(f"Getting questions from link #{index + 1} of {number_of_links}")
            text = link.text
            if text in already_downloaded_questions:
                print(f"Already downloaded the {text} questions, continuing")
                continue
            href = link.get("href")  # get the url from the link
            # get the questions from the page
            get_questions_from_page(page, href, topic=text)
        context.close()
        browser.close()
    except Exception as e:
        breakpoint()  # drop into the debugger to inspect the failure
        raise e
NB: Add the following code to the end of the file to call the run function when the file is run:
if __name__ == "__main__":
    with sync_playwright() as playwright:
        run(playwright)
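You can then run the script from a terminal (assuming the file keeps the repo's name):

python scrape_sanfoundry_questions.py

The scraped topics will show up as .txt files in the questions folder next to the script.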
4. Final code
import re
import os
from playwright.sync_api import Playwright, sync_playwright
from bs4 import BeautifulSoup

# a raw string avoids invalid-escape warnings for \d
question_regex = re.compile(r"^(\d+\.)((\s*\w*\s*)*) (?<![\r])")

questions_directory_path = os.path.join(
    os.path.dirname(__file__), "questions"
)
if not os.path.exists(questions_directory_path):
    os.makedirs(questions_directory_path)

already_downloaded_questions = {
    f.removesuffix(".txt") for f in os.listdir(questions_directory_path)
}

url = "https://www.sanfoundry.com/1000-python-questions-answers/"


def get_questions_from_page(page, url, topic="Basics"):
    page.goto(url, timeout=0)
    try:
        soup = BeautifulSoup(page.content(), "html.parser")
        file_path = os.path.join(questions_directory_path, f"{topic}.txt")
        # get the questions and save them to a text file at file_path
        get_questions_from_soup(soup, topic, file_path=file_path)
    except Exception as e:
        print(f"Error in get_questions_from_page: {e}")


def get_questions_from_soup(soup, topic="Basics", file_path="questions.txt"):
    paragraphs = soup.select(".entry-content>p")
    answers = soup.select(".entry-content>.collapseomatic_content")
    code_blocks = soup.select(".entry-content .hk1_style pre")
    paragraph_index, answer_index, code_block_index = 0, 0, 0
    """
    Not all paragraphs on the page are questions, some questions span
    multiple paragraphs, and not every question has a code block.
    So we need the above variables to keep track of where we're at
    and increment them as we advance.
    """
    texts = []
    while answer_index < len(answers):
        paragraph = paragraphs[paragraph_index].text.replace('\r', '')
        paragraph_index += 1
        if not question_regex.search(paragraph):
            print(f"paragraph: {paragraph} did not match test, continuing")
            continue
        answer = answers[answer_index].text.replace('\r', '')
        answer_index += 1
        text = f"({topic}) {paragraph}"
        if len(paragraph.split('\n')) >= 2:
            print("The question has the options in it, setting text to the question")
            paragraph_text = paragraph.rstrip().removesuffix('View Answer')
            text = f"({topic}) {paragraph_text}\n\n{answer}"
        else:
            print(f"This question has a code block, code_block_index: {code_block_index}")
            # the question has a separate code block; the next paragraph holds the options
            answer_paragraph = paragraphs[paragraph_index].text.replace('\r', '').rstrip().removesuffix('View Answer')
            code_block = code_blocks[code_block_index].text
            print("This question has a code block, the block is: ")
            print(code_block)
            text = f"{text}\n\n{code_block}\n\n{answer_paragraph}\n\n{answer}"
            code_block_index += 1
            paragraph_index += 1
        texts.append(text)
    with open(file_path, "w", encoding="utf-8") as file:
        file.write('\n\n\n\n'.join(texts))


def run(playwright: Playwright) -> None:
    try:
        browser = playwright.chromium.launch(headless=False)
        context = browser.new_context()
        page = context.new_page()
        page.goto(url, timeout=0)
        soup = BeautifulSoup(page.content(), "html.parser")
        basics_file_name = os.path.join(questions_directory_path, "Basics.txt")
        get_questions_from_soup(soup, file_path=basics_file_name)
        links = soup.select(".sf-2col-tbl li a")
        number_of_links = len(links)
        for index, link in enumerate(links):
            print(f"Getting questions from link #{index + 1} of {number_of_links}")
            text = link.text
            if text in already_downloaded_questions:
                print(f"Already downloaded the {text} questions, continuing")
                continue
            href = link.get("href")
            get_questions_from_page(page, href, topic=text)
        context.close()
        browser.close()
    except Exception as e:
        breakpoint()  # drop into the debugger to inspect the failure
        raise e


if __name__ == "__main__":
    with sync_playwright() as playwright:
        run(playwright)
I hope this article has been informative. If you run into any issues or have suggestions, feel free to contact me.