import asyncio
import csv
import json

import aiohttp
from bs4 import BeautifulSoup
from lxml import etree
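# Assumed layout of each row in unprocessed_data.csv (inferred from the
# column indices used in scrap_data() below; adjust if your file differs):
#   row[0] serial_no, row[1] src, row[2] dest, row[3] url, row[4] page_rover_url
# Hypothetical example row:
#   1,NDLS,BCT,https://example.com/ndls-to-bct-trains,https://example.com/rover?from=NDLS&to=BCT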
async def validate_route_page():
    # Read the input rows and group them into batches of 10 so that only
    # ten requests are in flight per asyncio.gather() call.
    with open('unprocessed_data.csv', 'r', newline='') as file:
        reader = csv.reader(file)
        route_data = []
        route_temp = []
        for idx, row in enumerate(reader):
            route_temp.append(row[:6])
            if not (idx + 1) % 10:
                route_data.append(route_temp)
                route_temp = []
                print(len(route_data) / 10000)  # progress, assuming ~10,000 batches
        if route_temp:  # leftover rows that did not fill a batch of 10
            route_data.append(route_temp)

    with open('processed_data.csv', 'w+', newline='') as file2:
        writer = csv.writer(file2)
        writer.writerow(["serial_no", "url", "url_status", "page_rover_url", "page_rover_status"])
        # ssl=False skips certificate verification for the scraped hosts.
        async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
            for rd in route_data:
                tasks = [scrap_data(j, session) for j in rd]
                result_data = await asyncio.gather(*tasks)
                # asyncio.sleep (not time.sleep), so the event loop is not
                # blocked while pausing between batches.
                await asyncio.sleep(5)
                for data in result_data:
                    writer.writerow(data)
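# scrap_data() performs two checks per route and returns one CSV row:
#   [serial_no, url, url_status, page_rover_url, page_rover_status]
# url_status        - True if the route page renders at least one train section
# page_rover_status - True if the rover JSON endpoint reports a non-empty q_filter
# Failed requests return the same five-column shape with an error marker so
# every row stays aligned with the header written above.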
async def scrap_data(row, session):
    serial_no = row[0]  # renamed from `id`, which shadows a builtin
    src = row[1]
    dest = row[2]
    url = row[3]
    page_rover_url = row[4]

    # --- First check: does the route page itself render any train sections? ---
    try:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=30)) as res:
            if res.status != 200:
                return [serial_no, url, 'timeout', page_rover_url, '']
            # read_nowait() only returns already-buffered bytes; await the full body.
            data = await res.read()
    except Exception as e:
        print(src, dest, 'exception:', e)
        return [serial_no, url, 'Exception Found', page_rover_url, '']

    main_soup = BeautifulSoup(data, 'html.parser')
    dom = etree.HTML(str(main_soup))
    a1 = len(dom.xpath('//div[3]/h2'))      # popular trains from src to dest
    a2 = len(dom.xpath('//main/div[3]/p'))  # "N found from ..." count line
    a3 = len(dom.xpath('//div[4]/h2'))      # covid special trains from src to dest
    a4 = len(dom.xpath('//div[5]/h2'))      # other trains from src to dest
    url_status = a1 >= 1 or a2 >= 1 or a3 >= 1 or a4 >= 1
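    # The rover endpoint is assumed to return JSON shaped like
    #   {"data": {"q_filter": [...]}}
    # (inferred from the key lookups below); a missing or empty q_filter
    # marks the route as not covered.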
    # --- Second check: the page-rover JSON endpoint for the same route ---
    try:
        async with session.get(page_rover_url + "&format=json",
                               timeout=aiohttp.ClientTimeout(total=10)) as res:
            if res.status != 200:
                return [serial_no, url, url_status, page_rover_url, 'timeout']
            data = await res.read()
    except Exception as e:
        print(src, dest, 'exception:', e)
        return [serial_no, url, url_status, page_rover_url, 'error']

    try:
        response_info = json.loads(data)
        # A non-empty q_filter list means the endpoint has data for this route.
        page_rover_status = len(response_info['data']['q_filter']) >= 1
    except Exception:
        page_rover_status = 'Exception Found'

    data = [serial_no, url, url_status, page_rover_url, page_rover_status]
    print(data)
    return data
if __name__ == "__main__":
    asyncio.run(validate_route_page())