// Package scraper provides helpers for walking parsed HTML trees: finding
// nodes by element name or attribute value and collecting their text.
package scraper

import (
	"strings"

	"golang.org/x/net/html"
)

// crawlFunc is the signature shared by the recursive tree-walking closures
// used throughout this file.
type crawlFunc func(*html.Node)

// SearchElem walks the tree rooted at n and sends every element node whose
// tag name equals data on the returned channel. The channel is closed when
// the walk finishes.
func SearchElem(n *html.Node, data string) chan *html.Node {
	ch := make(chan *html.Node)

	var crawl crawlFunc
	crawl = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == data {
			ch <- n
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			crawl(c)
		}
	}

	// Walk in a separate goroutine so the caller can range over the channel.
	go func() {
		defer close(ch)
		crawl(n)
	}()

	return ch
}

// SearchAttr walks the tree rooted at n and sends every node carrying an
// attribute named key whose value contains the word in contains.
func SearchAttr(n *html.Node, key, contains string) chan *html.Node {
	ch := make(chan *html.Node)

	var crawl crawlFunc
	crawl = func(n *html.Node) {
		for _, a := range n.Attr {
			if a.Key == key && containsWord(a.Val, contains) {
				ch <- n
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			crawl(c)
		}
	}

	go func() {
		defer close(ch)
		crawl(n)
	}()

	return ch
}

// SearchElemAttr combines SearchElem and SearchAttr: it sends every element
// named elem that has an attribute key whose value contains value.
func SearchElemAttr(n *html.Node, elem, key, value string) chan *html.Node {
	ch := make(chan *html.Node)

	go func() {
		defer close(ch)
		for e := range SearchElem(n, elem) {
			// If the document is too large, this spawns hundreds of
			// goroutines :((
			for attr := range SearchAttr(e, key, value) {
				ch <- attr
			}
		}
	}()

	return ch
}

// CrawlText collects the text of every text node under n, trimming each one
// and joining them with newlines.
func CrawlText(n *html.Node) string {
	var s = new(strings.Builder)

	var crawl crawlFunc
	crawl = func(n *html.Node) {
		if n.Type == html.TextNode {
			// Skip whitespace-only text nodes so the output has no
			// blank lines.
			if trimmed := strings.TrimSpace(n.Data); trimmed != "" {
				Must(s.WriteString(trimmed + "\n"))
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			crawl(c)
		}
	}

	crawl(n)

	return s.String()
}
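
// Usage sketch (illustrative, not part of the original file): parse a document
// with html.Parse and print the text of every <div> whose class attribute
// contains "content". The input source (resp.Body) and the selector values are
// assumptions; Must and containsWord are helpers this file uses but defines
// elsewhere in the package.
//
//	doc, err := html.Parse(resp.Body)
//	if err != nil {
//		log.Fatal(err)
//	}
//	for node := range SearchElemAttr(doc, "div", "class", "content") {
//		fmt.Println(CrawlText(node))
//	}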