From cb96b33e902040c0f23a0e68e876f87052ce4b28 Mon Sep 17 00:00:00 2001
From: potassium
Date: Tue, 23 Dec 2025 23:56:13 +0300
Subject: [PATCH] initial

---
 README          |   0
 scraper.go      | 115 +++++++++++++++++++++++++++++++++++++++++++++++
 scraper_test.go |  75 ++++++++++++++++++++++++++++
 3 files changed, 190 insertions(+)
 create mode 100644 README
 create mode 100644 scraper.go
 create mode 100644 scraper_test.go

diff --git a/README b/README
new file mode 100644
index 0000000..e69de29
diff --git a/scraper.go b/scraper.go
new file mode 100644
index 0000000..b839311
--- /dev/null
+++ b/scraper.go
@@ -0,0 +1,115 @@
+package parser
+
+import (
+	"strings"
+
+	"golang.org/x/net/html"
+)
+
+type crawlFunc func(*html.Node)
+
+func searchElem(n *html.Node, data string) chan *html.Node {
+	ch := make(chan *html.Node)
+	var crawl crawlFunc
+	crawl = func(n *html.Node) {
+		if n.Type == html.ElementNode && n.Data == data {
+			ch <- n
+		}
+		c := n.FirstChild
+		for c != nil {
+			crawl(c)
+			c = c.NextSibling
+		}
+	}
+	go func() {
+		defer close(ch)
+		crawl(n)
+	}()
+
+	return ch
+}
+
+func searchAttr(n *html.Node, key, contains string) chan *html.Node {
+	ch := make(chan *html.Node)
+	var crawl crawlFunc
+	crawl = func(n *html.Node) {
+		for _, a := range n.Attr {
+			if a.Key == key &&
+				containsWord(a.Val, contains) {
+				ch <- n
+			}
+		}
+		c := n.FirstChild
+		for c != nil {
+			crawl(c)
+			c = c.NextSibling
+		}
+	}
+	go func() {
+		defer close(ch)
+		crawl(n)
+	}()
+	return ch
+}
+
+func searchElemAttr(n *html.Node, elem, key, value string) chan *html.Node {
+	ch := make(chan *html.Node)
+	go func() {
+		defer close(ch)
+		for e := range searchElem(n, elem) {
+			// If the document is large, this nesting can
+			// spawn hundreds of goroutines :((
+			for attr := range searchAttr(e, key, value) {
+				ch <- attr
+			}
+		}
+	}()
+	return ch
+}
+
+func crawlText(n *html.Node) string {
+	var s = new(strings.Builder)
+	var crawl crawlFunc
+	crawl = func(n *html.Node) {
+		if n.Type == html.TextNode {
+			Must(
+				s.WriteString(func() string {
+					trimmed := strings.TrimSpace(n.Data)
+					if trimmed == "" {
+						return trimmed
+					}
+					return trimmed + "\n"
+				}()),
+			)
+		}
+		c := n.FirstChild
+		for c != nil {
+			crawl(c)
+			c = c.NextSibling
+		}
+	}
+	crawl(n)
+	return s.String()
+}
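+
+// Helpers assumed by the code above: the patch calls Must and containsWord
+// but never defines them, so these are minimal sketch implementations.
+
+// containsWord reports whether word appears as a whitespace-separated
+// token in s, e.g. a single class name inside a class attribute.
+func containsWord(s, word string) bool {
+	for _, w := range strings.Fields(s) {
+		if w == word {
+			return true
+		}
+	}
+	return false
+}
+
+// Must panics if err is non-nil and otherwise returns its first argument.
+func Must[T any](v T, err error) T {
+	if err != nil {
+		panic(err)
+	}
+	return v
+}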
diff --git a/scraper_test.go b/scraper_test.go
new file mode 100644
index 0000000..42d0221
--- /dev/null
+++ b/scraper_test.go
@@ -0,0 +1,75 @@
+package parser
+
+import (
+	"strings"
+	"testing"
+
+	"golang.org/x/net/html"
+)
+
+// Test fixture: two span elements with id "test"; the assertions below
+// count matches against it.
+const htmlStr = `<html><body>
+<span id="test">Hello</span>
+<span id="test">World</span>
+</body></html>`
+
+func TestSearchElem(t *testing.T) {
+	doc := Must(
+		html.Parse(strings.NewReader(htmlStr)),
+	)
+
+	ch := searchElem(doc, "span")
+	count := 0
+	for range ch {
+		count++
+	}
+
+	if count != 2 {
+		t.Errorf("Expected: 2 span elements, got: %d", count)
+	}
+}
+
+func TestSearchAttr(t *testing.T) {
+	doc := Must(
+		html.Parse(strings.NewReader(htmlStr)),
+	)
+
+	ch := searchAttr(doc, "id", "test")
+	count := 0
+	for range ch {
+		count++
+	}
+
+	if count != 2 {
+		t.Errorf("Expected: 2 elements with id 'test', got: %d", count)
+	}
+}
+
+func TestSearchElemAttr(t *testing.T) {
+	doc := Must(
+		html.Parse(strings.NewReader(htmlStr)),
+	)
+
+	ch := searchElemAttr(doc, "span", "id", "test")
+	count := 0
+	for range ch {
+		count++
+	}
+
+	if count != 2 {
+		t.Errorf("Expected: 2 span elements with id 'test', got: %d", count)
+	}
+}
+
+func TestCrawlText(t *testing.T) {
+	doc := Must(
+		html.Parse(strings.NewReader(htmlStr)),
+	)
+
+	got := crawlText(doc)
+	want := "Hello\nWorld\n"
+	if got != want {
+		t.Errorf("Expected: %q, got: %q", want, got)
+	}
+}
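
Usage sketch (not part of the patch): the search functions are unexported,
so a caller has to live inside package parser. The function name
printExternalLinks, the URL argument, and the "external" class value are
illustrative assumptions, not anything this patch defines.

package parser

import (
	"fmt"
	"net/http"

	"golang.org/x/net/html"
)

// printExternalLinks fetches a page, then streams every <a> element whose
// class attribute contains the word "external" and prints its text.
func printExternalLinks(url string) error {
	resp, err := http.Get(url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	doc, err := html.Parse(resp.Body)
	if err != nil {
		return err
	}

	// Each search function closes its channel once the crawl goroutine
	// finishes, so a plain range loop drains it safely.
	for n := range searchElemAttr(doc, "a", "class", "external") {
		fmt.Print(crawlText(n))
	}
	return nil
}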