2025-12-23 23:56:13 +03:00
parent 2cb30642cf
commit cb96b33e90
3 changed files with 160 additions and 0 deletions

README Normal file (0 lines added)

scraper.go Normal file (93 lines added)

@@ -0,0 +1,93 @@
package parser

import (
	"strings"

	"golang.org/x/net/html"
)

// crawlFunc is a recursive walker over an HTML node tree.
type crawlFunc func(*html.Node)
// searchElem walks the tree rooted at n and streams every element
// node whose tag name equals data (e.g. "span"). The channel is
// closed once the whole tree has been visited.
func searchElem(n *html.Node, data string) chan *html.Node {
	ch := make(chan *html.Node)
	var crawl crawlFunc
	crawl = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == data {
			ch <- n
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			crawl(c)
		}
	}
	go func() {
		defer close(ch)
		crawl(n)
	}()
	return ch
}
// searchAttr walks the tree rooted at n and streams every node that
// carries an attribute key whose value contains the word contains.
// containsWord is assumed to be defined elsewhere in the package
// (it is not part of this commit).
func searchAttr(n *html.Node, key, contains string) chan *html.Node {
	ch := make(chan *html.Node)
	var crawl crawlFunc
	crawl = func(n *html.Node) {
		for _, a := range n.Attr {
			if a.Key == key && containsWord(a.Val, contains) {
				ch <- n
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			crawl(c)
		}
	}
	go func() {
		defer close(ch)
		crawl(n)
	}()
	return ch
}
// searchElemAttr streams every element named elem whose attribute
// key contains value.
func searchElemAttr(n *html.Node, elem, key, value string) chan *html.Node {
	ch := make(chan *html.Node)
	go func() {
		defer close(ch)
		for e := range searchElem(n, elem) {
			// If the document is too large, this would spawn
			// hundreds of goroutines :((
			for attr := range searchAttr(e, key, value) {
				ch <- attr
			}
		}
	}()
	return ch
}
// crawlText collects the trimmed text of every non-empty text node in
// the tree rooted at n, one line per text node. Whitespace-only text
// nodes are skipped. Must is assumed to be defined elsewhere in the
// package (it is not part of this commit).
func crawlText(n *html.Node) string {
	s := new(strings.Builder)
	var crawl crawlFunc
	crawl = func(n *html.Node) {
		if n.Type == html.TextNode {
			if trimmed := strings.TrimSpace(n.Data); trimmed != "" {
				Must(s.WriteString(trimmed + "\n"))
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			crawl(c)
		}
	}
	crawl(n)
	return s.String()
}
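
Note on the API above: each search function starts a goroutine that sends matches on an unbuffered channel, so a caller has to range over the result until the channel is closed, otherwise the walker goroutine blocks on its next send. A minimal sketch of that consumption pattern, combining searchElemAttr with crawlText; the helper name collectText is hypothetical and not part of this commit:

package parser

import "golang.org/x/net/html"

// collectText drains searchElemAttr to completion and flattens every
// matching node to plain text. Hypothetical helper, shown only to
// illustrate the intended usage of the channel-based API.
func collectText(doc *html.Node, elem, key, value string) []string {
	var texts []string
	for node := range searchElemAttr(doc, elem, key, value) {
		texts = append(texts, crawlText(node))
	}
	return texts
}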

scraper_test.go Normal file (67 lines added)

@@ -0,0 +1,67 @@
package parser

import (
	"strings"
	"testing"

	"golang.org/x/net/html"
)

const htmlStr = `<div><span id="test">Hello</span><span id="test">World</span></div>`
func TestSearchElem(t *testing.T) {
	doc := Must(
		html.Parse(strings.NewReader(htmlStr)),
	)
	ch := searchElem(doc, "span")
	count := 0
	for range ch {
		count++
	}
	if count != 2 {
		t.Errorf("Expected: 2 span elements, got: %d", count)
	}
}

func TestSearchAttr(t *testing.T) {
	doc := Must(
		html.Parse(strings.NewReader(htmlStr)),
	)
	ch := searchAttr(doc, "id", "test")
	count := 0
	for range ch {
		count++
	}
	if count != 2 {
		t.Errorf("Expected: 2 span elements with id 'test', got: %d", count)
	}
}

func TestSearchElemAttr(t *testing.T) {
	doc := Must(
		html.Parse(strings.NewReader(htmlStr)),
	)
	ch := searchElemAttr(doc, "span", "id", "test")
	count := 0
	for range ch {
		count++
	}
	if count != 2 {
		t.Errorf("Expected: 2 span elements with id 'test', got: %d", count)
	}
}
func TestCrawlText(t *testing.T) {
	got := crawlText(Must(
		html.Parse(strings.NewReader(htmlStr)),
	))
	want := "Hello\nWorld\n"
	if got != want {
		t.Errorf("Expected: %q, got: %q", want, got)
	}
}