searchelemattr
This commit is contained in:
4
README
4
README
@@ -1,3 +1,5 @@
|
|||||||
Basic HTML scraper
|
Basic HTML scraper
|
||||||
|
|
||||||
Was done long-long time ago. I'm surprised how I managed to force myself to make it work.
|
Was done long-long time ago. I'm surprised how I managed to force myself to make it work.
|
||||||
|
|
||||||
|
TODO: add a context to prevent endless crawling
|
||||||
|
|||||||
41
match.go
41
match.go
@@ -17,13 +17,19 @@ var body = &html.Node{
|
|||||||
|
|
||||||
var ErrNotAnElementNode = errors.New("not an ElementNode")
|
var ErrNotAnElementNode = errors.New("not an ElementNode")
|
||||||
|
|
||||||
|
func parseFragment(s string) (*html.Node, error) {
|
||||||
|
n, err := html.ParseFragment(strings.NewReader(s), body)
|
||||||
|
if err != nil {
|
||||||
|
return n, err
|
||||||
|
}
|
||||||
|
return n[0], nil
|
||||||
|
}
|
||||||
|
|
||||||
func MatchElemAttr(s string, n2 *html.Node) (bool, error) {
|
func MatchElemAttr(s string, n2 *html.Node) (bool, error) {
|
||||||
r := strings.NewReader(s)
|
n1, err := parseFragment(s)
|
||||||
n, err := html.ParseFragment(r, body)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return false, err
|
return false, err
|
||||||
}
|
}
|
||||||
n1 := n[0]
|
|
||||||
|
|
||||||
if !(n1.Type == html.ElementNode &&
|
if !(n1.Type == html.ElementNode &&
|
||||||
n2.Type == html.ElementNode) {
|
n2.Type == html.ElementNode) {
|
||||||
@@ -36,3 +42,32 @@ func MatchElemAttr(s string, n2 *html.Node) (bool, error) {
|
|||||||
}
|
}
|
||||||
return false, nil
|
return false, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
func SearchElemAttr2(s string, n2 *html.Node) (chan *html.Node, error) {
|
||||||
|
ch := make(chan *html.Node)
|
||||||
|
var crawl func(*html.Node)
|
||||||
|
crawl = func(n *html.Node) {
|
||||||
|
matches, err := MatchElemAttr(s, n)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
if matches {
|
||||||
|
ch <- n
|
||||||
|
}
|
||||||
|
c := n.FirstChild
|
||||||
|
for c != nil {
|
||||||
|
crawl(c)
|
||||||
|
c = c.NextSibling
|
||||||
|
}
|
||||||
|
}
|
||||||
|
go func() {
|
||||||
|
defer close(ch)
|
||||||
|
crawl(n2)
|
||||||
|
}()
|
||||||
|
|
||||||
|
if r := recover(); r != nil {
|
||||||
|
return nil, r
|
||||||
|
}
|
||||||
|
return <-ch, nil
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user