From 12b9af4c0156e66a4e77b4f10c4c5ca583fc5085 Mon Sep 17 00:00:00 2001
From: potassium
Date: Wed, 11 Feb 2026 23:49:58 +0300
Subject: [PATCH] searchelemattr

---
 README   |  4 +++-
 match.go | 41 ++++++++++++++++++++++++++++++++++++++---
 2 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/README b/README
index 8129941..ed0b17e 100644
--- a/README
+++ b/README
@@ -1,3 +1,5 @@
 Basic HTML scraper
-Was done long-long time ago. I'm surprised how I managed to force myself to make it work.
\ No newline at end of file
+Was done long-long time ago. I'm surprised how I managed to force myself to make it work.
+
+TODO: add context, to prevent endless crawling
diff --git a/match.go b/match.go
index bc570bb..0b0c601 100644
--- a/match.go
+++ b/match.go
@@ -17,13 +17,19 @@ var body = &html.Node{
 
 var ErrNotAnElementNode = errors.New("not an ElementNode")
 
+func parseFragment(s string) (*html.Node, error) {
+	n, err := html.ParseFragment(strings.NewReader(s), body)
+	if err != nil {
+		return nil, err
+	}
+	return n[0], nil
+}
+
 func MatchElemAttr(s string, n2 *html.Node) (bool, error) {
-	r := strings.NewReader(s)
-	n, err := html.ParseFragment(r, body)
+	n1, err := parseFragment(s)
 	if err != nil {
 		return false, err
 	}
-	n1 := n[0]
 
 	if !(n1.Type == html.ElementNode && n2.Type == html.ElementNode) {
@@ -36,3 +42,32 @@ func MatchElemAttr(s string, n2 *html.Node) (bool, error) {
 	}
 	return false, nil
 }
+
+// SearchElemAttr2 walks the tree rooted at n2 and streams every element
+// matching the fragment s on the returned channel, which is closed when
+// the crawl finishes. An invalid fragment is reported immediately.
+func SearchElemAttr2(s string, n2 *html.Node) (chan *html.Node, error) {
+	// Validate the pattern once, before the crawl goroutine starts.
+	if _, err := parseFragment(s); err != nil {
+		return nil, err
+	}
+	ch := make(chan *html.Node)
+	var crawl func(*html.Node)
+	crawl = func(n *html.Node) {
+		// Only element nodes can match; for text/comment nodes
+		// MatchElemAttr would return ErrNotAnElementNode.
+		if n.Type == html.ElementNode {
+			if ok, err := MatchElemAttr(s, n); err == nil && ok {
+				ch <- n
+			}
+		}
+		for c := n.FirstChild; c != nil; c = c.NextSibling {
+			crawl(c)
+		}
+	}
+	go func() {
+		defer close(ch)
+		crawl(n2)
+	}()
+	return ch, nil
+}