searchelemattr

This commit is contained in:
2026-02-11 23:49:58 +03:00
parent 55a6a4d3ef
commit 12b9af4c01
2 changed files with 41 additions and 4 deletions

4
README
View File

@@ -1,3 +1,5 @@
Basic HTML scraper Basic HTML scraper
Was done long-long time ago. I'm surprised how I managed to force myself to make it work. Was done long-long time ago. I'm surprised how I managed to force myself to make it work.
TODO: add context support to prevent endless crawling.

View File

@@ -17,13 +17,19 @@ var body = &html.Node{
var ErrNotAnElementNode = errors.New("not an ElementNode") var ErrNotAnElementNode = errors.New("not an ElementNode")
// parseFragment parses s as an HTML fragment in the context of the
// package-level <body> node and returns the first resulting node.
// It returns an error if parsing fails or the fragment yields no nodes.
func parseFragment(s string) (*html.Node, error) {
	nodes, err := html.ParseFragment(strings.NewReader(s), body)
	if err != nil {
		// Original code did `return n, err`, which does not type-check:
		// n is []*html.Node while the result type is *html.Node.
		// On error the parsed value is meaningless, so return nil.
		return nil, err
	}
	if len(nodes) == 0 {
		// Guard against n[0] panicking on an empty fragment (e.g. s == "").
		return nil, errors.New("empty fragment")
	}
	return nodes[0], nil
}
func MatchElemAttr(s string, n2 *html.Node) (bool, error) { func MatchElemAttr(s string, n2 *html.Node) (bool, error) {
r := strings.NewReader(s) n1, err := parseFragemnt(s)
n, err := html.ParseFragment(r, body)
if err != nil { if err != nil {
return false, err return false, err
} }
n1 := n[0]
if !(n1.Type == html.ElementNode && if !(n1.Type == html.ElementNode &&
n2.Type == html.ElementNode) { n2.Type == html.ElementNode) {
@@ -36,3 +42,32 @@ func MatchElemAttr(s string, n2 *html.Node) (bool, error) {
} }
return false, nil return false, nil
} }
// SearchElemAttr2 walks the subtree rooted at n2 and sends every node
// matching the HTML fragment s on the returned channel. The walk runs in
// a background goroutine and the channel is closed when it finishes.
//
// The fragment is parsed once up front so a malformed s is reported
// synchronously instead of crashing the walker goroutine. Nodes for which
// MatchElemAttr reports an error (e.g. text or comment nodes, which are
// not element nodes) are treated as non-matches and skipped.
//
// NOTE(review): if the caller stops draining the channel, the walker
// goroutine blocks forever on the send — matches the README TODO about
// adding context cancellation; confirm intended lifetime with callers.
func SearchElemAttr2(s string, n2 *html.Node) (chan *html.Node, error) {
	// Validate s before spawning the goroutine. The original code
	// panicked inside the goroutine and then called recover() in the
	// normal flow of this function — recover only works directly inside
	// a deferred function in the same goroutine, so it could never fire,
	// and `return nil, r` did not type-check (any vs error) anyway.
	if _, err := parseFragment(s); err != nil {
		return nil, err
	}

	ch := make(chan *html.Node)

	var crawl func(*html.Node)
	crawl = func(n *html.Node) {
		// A match error here means n is not an element node; that is a
		// normal occurrence during a tree walk, not a fatal condition.
		if ok, err := MatchElemAttr(s, n); err == nil && ok {
			ch <- n
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			crawl(c)
		}
	}

	go func() {
		defer close(ch)
		crawl(n2)
	}()

	// Return the channel itself. The original `return <-ch, nil` did not
	// type-check (*html.Node vs chan *html.Node) and would also have
	// blocked until the first match arrived.
	return ch, nil
}