searchelemattr
This commit is contained in:
2
README
2
README
@@ -1,3 +1,5 @@
|
||||
Basic HTML scraper
|
||||
|
||||
Written a long, long time ago. I'm surprised I managed to force myself to make it work.
|
||||
|
||||
TODO: accept a context.Context so crawling can be cancelled, preventing endless crawling.
|
||||
|
||||
41
match.go
41
match.go
@@ -17,13 +17,19 @@ var body = &html.Node{
|
||||
|
||||
var ErrNotAnElementNode = errors.New("not an ElementNode")
|
||||
|
||||
func parseFragment(s string) (*html.Node, error) {
|
||||
n, err := html.ParseFragment(strings.NewReader(s), body)
|
||||
if err != nil {
|
||||
return n, err
|
||||
}
|
||||
return n[0], nil
|
||||
}
|
||||
|
||||
func MatchElemAttr(s string, n2 *html.Node) (bool, error) {
|
||||
r := strings.NewReader(s)
|
||||
n, err := html.ParseFragment(r, body)
|
||||
n1, err := parseFragemnt(s)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
n1 := n[0]
|
||||
|
||||
if !(n1.Type == html.ElementNode &&
|
||||
n2.Type == html.ElementNode) {
|
||||
@@ -36,3 +42,32 @@ func MatchElemAttr(s string, n2 *html.Node) (bool, error) {
|
||||
}
|
||||
return false, nil
|
||||
}
|
||||
|
||||
|
||||
func SearchElemAttr2(s string, n2 *html.Node) (chan *html.Node, error) {
|
||||
ch := make(chan *html.Node)
|
||||
var crawl func(*html.Node)
|
||||
crawl = func(n *html.Node) {
|
||||
matches, err := MatchElemAttr(s, n)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
if matches {
|
||||
ch <- n
|
||||
}
|
||||
c := n.FirstChild
|
||||
for c != nil {
|
||||
crawl(c)
|
||||
c = c.NextSibling
|
||||
}
|
||||
}
|
||||
go func() {
|
||||
defer close(ch)
|
||||
crawl(n2)
|
||||
}()
|
||||
|
||||
if r := recover(); r != nil {
|
||||
return nil, r
|
||||
}
|
||||
return <-ch, nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user