Compare commits

...

3 Commits

Author SHA1 Message Date
55a6a4d3ef MatchElemAttr func implementation 2026-01-16 01:13:53 +03:00
78985687a1 go.mod 2026-01-04 13:25:59 +03:00
5cb8665a08 exported 2025-12-27 09:07:39 +03:00
6 changed files with 73 additions and 16 deletions

5
go.mod
View File

@@ -1,5 +0,0 @@
module git.nkpl.cc/XoxJlopeZi4BB/scraper.git
go 1.25.5
require golang.org/x/net v0.48.0

2
go.sum
View File

@@ -1,2 +0,0 @@
golang.org/x/net v0.48.0 h1:zyQRTTrjc33Lhh0fBgT/H3oZq9WuvRR5gPC70xpDiQU=
golang.org/x/net v0.48.0/go.mod h1:+ndRgGjkh8FGtu1w1FGbEC31if4VrNVMuKTgcAAnQRY=

38
match.go Normal file
View File

@@ -0,0 +1,38 @@
package scraper
import (
"errors"
"slices"
"strings"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)
// Package-level values shared by the fragment-matching helpers.
var (
	// body is the <body> element node handed to html.ParseFragment as the
	// parsing context for fragments.
	body = &html.Node{
		Type:     html.ElementNode,
		DataAtom: atom.Body,
		Data:     "body",
	}

	// ErrNotAnElementNode is returned by MatchElemAttr when one of the
	// compared nodes is not an html.ElementNode.
	ErrNotAnElementNode = errors.New("not an ElementNode")
)
// MatchElemAttr reports whether the first node parsed from the HTML fragment s
// has the same tag name (Data) and the same attribute list as n2.
//
// s is parsed with html.ParseFragment using a <body> element as context, and
// only the first resulting node takes part in the comparison. Attributes are
// compared with slices.Equal, so attribute order matters.
//
// It returns ErrNotAnElementNode when s yields no nodes, when n2 is nil, or
// when either node is not an html.ElementNode. An error from
// html.ParseFragment is returned unchanged.
func MatchElemAttr(s string, n2 *html.Node) (bool, error) {
	nodes, err := html.ParseFragment(strings.NewReader(s), body)
	if err != nil {
		return false, err
	}
	// ParseFragment may return no nodes (e.g. for an empty s); guard before
	// indexing so the caller gets an error instead of a panic. A nil n2 is
	// rejected here too, before it is dereferenced below.
	if len(nodes) == 0 || n2 == nil {
		return false, ErrNotAnElementNode
	}
	n1 := nodes[0]
	if n1.Type != html.ElementNode || n2.Type != html.ElementNode {
		return false, ErrNotAnElementNode
	}
	return n1.Data == n2.Data && slices.Equal(n1.Attr, n2.Attr), nil
}

26
match_test.go Normal file
View File

@@ -0,0 +1,26 @@
package scraper
import (
"strings"
"testing"
"golang.org/x/net/html"
)
// fragment is the HTML snippet used both as the pattern string and as the
// source of the node it is matched against.
const fragment = `<div id="main-copy"></div>`

// TestMatchElemAttr checks that MatchElemAttr matches a node against the very
// fragment it was parsed from, and does not match a fragment with a different
// tag name.
func TestMatchElemAttr(t *testing.T) {
	nodes, err := html.ParseFragment(strings.NewReader(fragment), body)
	if err != nil {
		// Fatal, not Error: continuing would index into an unusable result.
		t.Fatalf("ParseFragment(%q): %v", fragment, err)
	}
	if len(nodes) == 0 {
		t.Fatalf("ParseFragment(%q) returned no nodes", fragment)
	}
	node := nodes[0]

	got, err := MatchElemAttr(fragment, node)
	if err != nil {
		t.Fatalf("MatchElemAttr(%q): unexpected error: %v", fragment, err)
	}
	if !got {
		t.Errorf("MatchElemAttr(%q) = false, want true", fragment)
	}

	const other = `<span id="main-copy"></span>`
	got, err = MatchElemAttr(other, node)
	if err != nil {
		t.Fatalf("MatchElemAttr(%q): unexpected error: %v", other, err)
	}
	if got {
		t.Errorf("MatchElemAttr(%q) = true, want false", other)
	}
}

View File

@@ -8,7 +8,7 @@ import (
type crawlFunc func(*html.Node) type crawlFunc func(*html.Node)
func searchElem(n *html.Node, data string) chan *html.Node { func SearchElem(n *html.Node, data string) chan *html.Node {
ch := make(chan *html.Node) ch := make(chan *html.Node)
var crawl crawlFunc var crawl crawlFunc
crawl = func(n *html.Node) { crawl = func(n *html.Node) {
@@ -29,7 +29,7 @@ func searchElem(n *html.Node, data string) chan *html.Node {
return ch return ch
} }
func searchAttr(n *html.Node, key, contains string) chan *html.Node { func SearchAttr(n *html.Node, key, contains string) chan *html.Node {
ch := make(chan *html.Node) ch := make(chan *html.Node)
var crawl crawlFunc var crawl crawlFunc
crawl = func(n *html.Node) { crawl = func(n *html.Node) {
@@ -52,14 +52,14 @@ func searchAttr(n *html.Node, key, contains string) chan *html.Node {
return ch return ch
} }
func searchElemAttr(n *html.Node, elem, key, value string) chan *html.Node { func SearchElemAttr(n *html.Node, elem, key, value string) chan *html.Node {
ch := make(chan *html.Node) ch := make(chan *html.Node)
go func() { go func() {
defer close(ch) defer close(ch)
for e := range searchElem(n, elem) { for e := range SearchElem(n, elem) {
// If document is too large there are // If document is too large there are
// would be a hundreds of goroutines :(( // would be a hundreds of goroutines :((
for attr := range searchAttr(e, key, value) { for attr := range SearchAttr(e, key, value) {
ch <- attr ch <- attr
} }
} }
@@ -67,7 +67,7 @@ func searchElemAttr(n *html.Node, elem, key, value string) chan *html.Node {
return ch return ch
} }
func crawlText(n *html.Node) string { func CrawlText(n *html.Node) string {
var s = new(strings.Builder) var s = new(strings.Builder)
var crawl crawlFunc var crawl crawlFunc
crawl = func(n *html.Node) { crawl = func(n *html.Node) {

View File

@@ -15,7 +15,7 @@ func TestSearchElem(t *testing.T) {
html.Parse(strings.NewReader(htmlStr)), html.Parse(strings.NewReader(htmlStr)),
) )
ch := searchElem(doc, "span") ch := SearchElem(doc, "span")
count := 0 count := 0
for range ch { for range ch {
count++ count++
@@ -31,7 +31,7 @@ func TestSearchAttr(t *testing.T) {
html.Parse(strings.NewReader(htmlStr)), html.Parse(strings.NewReader(htmlStr)),
) )
ch := searchAttr(doc, "id", "test") ch := SearchAttr(doc, "id", "test")
count := 0 count := 0
for range ch { for range ch {
count++ count++
@@ -47,7 +47,7 @@ func TestSearchElemAttr(t *testing.T) {
html.Parse(strings.NewReader(htmlStr)), html.Parse(strings.NewReader(htmlStr)),
) )
ch := searchElemAttr(doc, "span", "id", "test") ch := SearchElemAttr(doc, "span", "id", "test")
count := 0 count := 0
for range ch { for range ch {
count++ count++