Scraping with Go
Basics of HTML Elements
1. Search for tags
To select a tag, just write its name inside double quotes. (In Go, single quotes denote runes, so selector strings must use double quotes " ".)
For the anchor tag:
("a")
For the paragraph tag:
("p")
The same pattern works for every other tag.
2. Search for all elements of a tag that carry an attribute
Here id is treated as an attribute associated with the div tag.
Example:
("div[id]")
In general:
("html-tag[html-attribute]")
3. Search by attribute value
For example, ("div[id=comment]") matches every div whose id attribute has the value comment.
Because id values identify elements, there is a shorthand for searching by id:
("#comment")
4. Search for all elements based on class
For example, ("div[class=writer]") matches every div whose class is writer; the shorthand for any element with that class is:
(".writer")
5. Search for all elements that have a given attribute, regardless of tag
("*[html-attribute]")
Scraping in Golang Using Colly and Exporting into a CSV File
Example - 1
package main

import (
	"encoding/csv"
	"fmt"
	"log"
	"os"
	"time"

	"github.com/gocolly/colly"
)

// Quotes is our data model; it specifies the elements we will scrape.
type Quotes struct {
	AUTHOR string
	QUOTE  string
}

func QuoteScrapper() {
	// URL of the website that we want to scrape
	var url string = "https://www.brainyquote.com/top_100_quotes"
	// Name of our CSV file - you can call it anything you want
	var fileName string = "quote.csv"
	fmt.Println("Starting Scraping....")

	// Using the os package, create a CSV file in our directory
	file, err := os.Create(fileName)
	if err != nil {
		log.Fatal("Could not create file ", fileName, ": ", err)
	}
	defer file.Close()

	// writer writes the contents of the file
	writer := csv.NewWriter(file)
	defer writer.Flush()

	// Header row of the CSV file
	writer.Write([]string{"Author", "Quote"})

	// Colly - initializing our collector
	c := colly.NewCollector()
	c.SetRequestTimeout(120 * time.Second)

	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting:", r.URL)
	})
	c.OnResponse(func(r *colly.Response) {
		fmt.Println("Got a response from", r.Request.URL)
	})
	c.OnError(func(r *colly.Response, e error) {
		fmt.Println("Got this error:", e)
	})
	c.OnHTML(".quoteContent", func(h *colly.HTMLElement) {
		quote := &Quotes{}
		quote.AUTHOR = h.ChildText(".bq_fq_a")
		quote.QUOTE = h.ChildText(".b-qt-qt")
		writer.Write([]string{quote.AUTHOR, quote.QUOTE})
	})

	c.Visit(url)
	fmt.Println("End of Era: ", url)
}
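To try this out, here is a minimal sketch of a main function that invokes the scraper (assuming the code above lives in package main); running go run . then produces quote.csv in the working directory:

func main() {
	QuoteScrapper()
}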
Example - 2
package main

import (
	"encoding/csv"
	"fmt"
	"log"
	"os"
	"time"

	"github.com/gocolly/colly"
)

type PRODUCTS struct {
	Name     string
	Image    string
	Price    string
	Url      string
	Discount string
}

func StoreScrapper() {
	c := colly.NewCollector()
	c.SetRequestTimeout(120 * time.Second)

	var fileName string = "products.csv"
	fmt.Println("Starting Scraping....")
	file, err := os.Create(fileName)
	if err != nil {
		log.Fatal("Could not create file ", fileName, ": ", err)
	}
	defer file.Close()

	writer := csv.NewWriter(file)
	defer writer.Flush()
	writer.Write([]string{"Name", "Price", "Url"})

	// Callbacks
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL)
	})
	c.OnResponse(func(r *colly.Response) {
		fmt.Println("Got a response from", r.Request.URL)
	})
	c.OnError(func(r *colly.Response, e error) {
		fmt.Println("Got this error:", e, r.StatusCode)
	})
	c.OnHTML(".core", func(e *colly.HTMLElement) {
		e.ForEach(".name", func(_ int, h *colly.HTMLElement) {
			item := &PRODUCTS{}
			item.Name = h.Text
			// item.Image = e.ChildAttr("img.img-C", "data-src")
			item.Price = e.ChildText(".data-price")
			item.Url = "https://jumia.com.ng" + e.Attr("href")
			// item.Discount = e.ChildText("div.tag._dsct")
			writer.Write([]string{item.Name, item.Price, item.Url})
		})
	})

	c.Visit("https://www.jumia.com.ng/flash-sales/")
}
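One detail worth spelling out before moving on: OnHTML, ChildText, and the first argument of ChildAttr take CSS selectors, but Attr and the second argument of ChildAttr take a bare attribute name, never a dotted selector. A minimal sketch of the distinction (the class and attribute names mirror the commented-out lines above and may not match the live site):

package main

import (
	"fmt"

	"github.com/gocolly/colly"
)

func main() {
	c := colly.NewCollector()
	c.OnHTML(".core", func(e *colly.HTMLElement) { // CSS selector
		image := e.ChildAttr("img.img-C", "data-src") // selector first, bare attribute name second
		discount := e.ChildText("div.tag._dsct")      // descendant selector
		fmt.Println(image, discount)
	})
	c.Visit("https://www.jumia.com.ng/flash-sales/")
}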
Scraping in Golang Using Colly and Exporting into a JSON File
package main

import (
	"encoding/json"
	"fmt"
	"os"

	"github.com/gocolly/colly"
)

type NEWS struct {
	TITLE string `json:"title"`
	LINKS string `json:"links"`
	DATE  string `json:"date"`
}

func NewsCrawlerServer() {
	var url string = "https://www.thenews.com.pk/latest-stories"
	fmt.Println("Starting Scraping....")
	collector := colly.NewCollector()

	// data collects every scraped story; it is marshalled to JSON at the end.
	var data []NEWS

	collector.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting:", r.URL)
	})
	collector.OnResponse(func(r *colly.Response) {
		fmt.Println("Got a response from", r.Request.URL)
	})
	collector.OnError(func(r *colly.Response, e error) {
		fmt.Println("Got this error:", e)
	})
	collector.OnHTML(".writter-list-item-story", func(element *colly.HTMLElement) {
		news := &NEWS{}
		element.ForEach(".latest-right", func(_ int, h *colly.HTMLElement) {
			news.TITLE = h.ChildText(".open-section")
			news.LINKS = h.ChildAttr(".open-section", "href")
			news.DATE = h.ChildText(".latestDate")
			data = append(data, *news)
		})
	})

	collector.Visit(url)

	// Serialize everything we collected and write it to disk.
	content, err := json.Marshal(data)
	if err != nil {
		fmt.Println(err.Error())
	}
	os.WriteFile("news.json", content, 0644)
	fmt.Println("NEWS ", len(data))
}
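json.Marshal writes the whole file on a single line; if you want news.json to be human-readable, json.MarshalIndent is a drop-in replacement. A small sketch of the closing lines with two-space indentation:

	// Pretty-print the JSON instead of emitting one long line.
	content, err := json.MarshalIndent(data, "", "  ")
	if err != nil {
		fmt.Println(err.Error())
	}
	os.WriteFile("news.json", content, 0644)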
Scraping Table Data in Golang Using Colly and Exporting into CSV
package main

import (
	"encoding/csv"
	"fmt"
	"log"
	"os"

	"github.com/gocolly/colly"
)

type PSX struct {
	LDCP    string
	SCRIP   string
	OPEN    string
	HIGH    string
	LOW     string
	CURRENT string
	VOLUME  string
	CHANGE  string
}

func StockTableCrawler() {
	fName := "data.csv"
	file, err := os.Create(fName)
	if err != nil {
		log.Fatalf("Could not create file, err: %q", err)
	}
	defer file.Close()

	writer := csv.NewWriter(file)
	defer writer.Flush()

	var _url string = "https://www.urdupoint.com/english/"
	fmt.Println("Service Started....")

	collector := colly.NewCollector()
	collector.OnRequest(onRequest)
	collector.OnResponse(onResponse)
	collector.OnError(onError)
	collector.OnHTML(".table-responsive", func(e *colly.HTMLElement) {
		// Each table row becomes one CSV row, column by column.
		e.ForEach("tr", func(_ int, eh *colly.HTMLElement) {
			psxData := PSX{
				SCRIP:   eh.ChildText("td:nth-child(1)"),
				LDCP:    eh.ChildText("td:nth-child(2)"),
				OPEN:    eh.ChildText("td:nth-child(3)"),
				HIGH:    eh.ChildText("td:nth-child(4)"),
				LOW:     eh.ChildText("td:nth-child(5)"),
				CURRENT: eh.ChildText("td:nth-child(6)"),
				CHANGE:  eh.ChildText("td:nth-child(7)"),
				VOLUME:  eh.ChildText("td:nth-child(8)"),
			}
			writer.Write([]string{
				psxData.SCRIP,
				psxData.LDCP,
				psxData.OPEN,
				psxData.HIGH,
				psxData.LOW,
				psxData.CURRENT,
				psxData.CHANGE,
				psxData.VOLUME,
			})
		})
	})

	collector.Visit(_url)
	fmt.Println("Scraping Completed")
}

// on Request
func onRequest(r *colly.Request) {
	fmt.Println("Scraping:", r.URL)
}

// on Response
func onResponse(r *colly.Response) {
	fmt.Println("Status:", r.StatusCode)
}

// on Error
func onError(r *colly.Response, err error) {
	fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
}
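Note that the "tr" selector above also matches the table's header row, whose cells are th rather than td, so all of its ChildText lookups come back empty. A small guard inside the ForEach skips such rows (a sketch, assuming the header row contains no td cells):

	e.ForEach("tr", func(_ int, eh *colly.HTMLElement) {
		// Header rows use th cells, so the first td lookup is empty: skip them.
		if eh.ChildText("td:nth-child(1)") == "" {
			return
		}
		// ... build psxData and write the CSV row exactly as above ...
	})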
Scraping Data of Multiple Pages in Golang Using Colly and Exporting into CSV
package main

import (
	"encoding/csv"
	"fmt"
	"log"
	"os"

	"github.com/gocolly/colly"
)

type Book struct {
	Title string
	Price string
}

// writer is shared by all callbacks so the CSV file is opened only once;
// creating the file inside Data would truncate it on every scraped row.
var writer *csv.Writer

func Crawling() {
	file, err := os.Create("export.csv")
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	writer = csv.NewWriter(file)
	defer writer.Flush()
	writer.Write([]string{"TITLE", "PRICE"})

	Request()
	Response()
	HTML()
	NextPageHTML()
	Visiting()
}

// Data appends one row to the shared CSV writer.
func Data(data []string) {
	writer.Write(data)
}

var collector *colly.Collector = colly.NewCollector(
	colly.AllowedDomains("books.toscrape.com"),
)

func requesting(r *colly.Request) {
	fmt.Println("Visiting: ", r.URL)
}

func Request() {
	collector.OnRequest(requesting)
}

// responding
func responding(r *colly.Response) {
	fmt.Println("Response: ", r.StatusCode)
}

func Response() {
	collector.OnResponse(responding)
}

func htmlElement(e *colly.HTMLElement) {
	book := &Book{}
	book.Title = e.ChildAttr(".image_container img", "alt")
	book.Price = e.ChildText(".price_color")
	row := []string{book.Title, book.Price}
	Data(row)
}

func HTML() {
	collector.OnHTML(".product_pod", htmlElement)
}

// pagination follows the "next" link so every catalogue page gets visited.
func pagination(e *colly.HTMLElement) {
	nextPage := e.Request.AbsoluteURL(e.Attr("href"))
	collector.Visit(nextPage)
}

func NextPageHTML() {
	collector.OnHTML(".next > a", pagination)
}

func Visiting() {
	collector.Visit("https://books.toscrape.com/")
}
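As before, the crawl is kicked off from main. Because pagination calls collector.Visit on each page's "next" link, a single call to Crawling walks every catalogue page until no next link remains (a minimal sketch):

func main() {
	Crawling()
}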