Compare commits

3 commits: 3bbd6c72ca...master

| Author | SHA1 | Date |
|---|---|---|
| | 55a6a4d3ef | |
| | 78985687a1 | |
| | 5cb8665a08 | |
go.mod (5 deletions)

@@ -1,5 +0,0 @@
-module git.nkpl.cc/XoxJlopeZi4BB/scraper.git
-
-go 1.25.5
-
-require golang.org/x/net v0.48.0
go.sum (2 deletions)

@@ -1,2 +0,0 @@
-golang.org/x/net v0.48.0 h1:zyQRTTrjc33Lhh0fBgT/H3oZq9WuvRR5gPC70xpDiQU=
-golang.org/x/net v0.48.0/go.mod h1:+ndRgGjkh8FGtu1w1FGbEC31if4VrNVMuKTgcAAnQRY=
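Removing go.mod and go.sum together drops the module definition and its matching checksum entries in one step. If the module were being recreated elsewhere, the same state could presumably be restored with `go mod init git.nkpl.cc/XoxJlopeZi4BB/scraper.git` followed by `go get golang.org/x/net@v0.48.0`.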
match.go (new file, 38 additions)

@@ -0,0 +1,38 @@
+package scraper
+
+import (
+	"errors"
+	"slices"
+	"strings"
+
+	"golang.org/x/net/html"
+	"golang.org/x/net/html/atom"
+)
+
+var body = &html.Node{
+	Type:     html.ElementNode,
+	Data:     "body",
+	DataAtom: atom.Body,
+}
+
+var ErrNotAnElementNode = errors.New("not an ElementNode")
+
+func MatchElemAttr(s string, n2 *html.Node) (bool, error) {
+	r := strings.NewReader(s)
+	n, err := html.ParseFragment(r, body)
+	if err != nil {
+		return false, err
+	}
+	n1 := n[0]
+
+	if !(n1.Type == html.ElementNode &&
+		n2.Type == html.ElementNode) {
+		return false, ErrNotAnElementNode
+	}
+
+	if n1.Data == n2.Data &&
+		slices.Equal(n1.Attr, n2.Attr) {
+		return true, nil
+	}
+	return false, nil
+}
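MatchElemAttr parses the string `s` as an HTML fragment in a `<body>` context and reports whether its first node carries the same tag name and attribute list as `n2`. A minimal sketch of a call from inside the package (the document string and the node walk are illustrative, not taken from the repository):

```go
package scraper

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

// Sketch only, not part of the commit: parse a document, pick out the
// <div>, and check it against an equivalent fragment string.
func ExampleMatchElemAttr() {
	doc, err := html.Parse(strings.NewReader(`<html><body><div id="x"></div></body></html>`))
	if err != nil {
		panic(err)
	}
	// doc -> <html> -> <head>, <body> -> <div>; the walk is fixed by the input above.
	div := doc.FirstChild.FirstChild.NextSibling.FirstChild
	ok, err := MatchElemAttr(`<div id="x"></div>`, div)
	fmt.Println(ok, err)
	// Output: true <nil>
}
```

Note that `slices.Equal` compares the attribute slices element-wise, so two nodes with the same attributes in a different order would not match.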
match_test.go (new file, 26 additions)

@@ -0,0 +1,26 @@
+package scraper
+
+import (
+	"strings"
+	"testing"
+
+	"golang.org/x/net/html"
+)
+
+const fragment = `<div id="main-copy"></div>`
+
+func TestMatchElemAttr(t *testing.T) {
+	n, err := html.ParseFragment(strings.NewReader(fragment), body)
+	if err != nil {
+		t.Error(err)
+	}
+	n1 := n[0]
+
+	result, err := MatchElemAttr(fragment, n1)
+	if err != nil {
+		t.Error(err)
+	}
+	if !result {
+		t.Fail()
+	}
+}
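The committed test covers only the matching case. A complementary mismatch check (an assumption, not part of this compare; it reuses the `fragment` constant, the `body` node, and the imports from match_test.go above) might look like:

```go
// Sketch only: a fragment with a different attribute value should not match.
func TestMatchElemAttrMismatch(t *testing.T) {
	n, err := html.ParseFragment(strings.NewReader(fragment), body)
	if err != nil {
		t.Fatal(err)
	}
	result, err := MatchElemAttr(`<div id="other"></div>`, n[0])
	if err != nil {
		t.Fatal(err)
	}
	if result {
		t.Fail()
	}
}
```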
scraper.go (12 changes: 6 additions, 6 deletions)

@@ -8,7 +8,7 @@ import (
 
 type crawlFunc func(*html.Node)
 
-func searchElem(n *html.Node, data string) chan *html.Node {
+func SearchElem(n *html.Node, data string) chan *html.Node {
 	ch := make(chan *html.Node)
 	var crawl crawlFunc
 	crawl = func(n *html.Node) {
@@ -29,7 +29,7 @@ func searchElem(n *html.Node, data string) chan *html.Node {
 	return ch
 }
 
-func searchAttr(n *html.Node, key, contains string) chan *html.Node {
+func SearchAttr(n *html.Node, key, contains string) chan *html.Node {
 	ch := make(chan *html.Node)
 	var crawl crawlFunc
 	crawl = func(n *html.Node) {
@@ -52,14 +52,14 @@ func searchAttr(n *html.Node, key, contains string) chan *html.Node {
 	return ch
 }
 
-func searchElemAttr(n *html.Node, elem, key, value string) chan *html.Node {
+func SearchElemAttr(n *html.Node, elem, key, value string) chan *html.Node {
 	ch := make(chan *html.Node)
 	go func() {
 		defer close(ch)
-		for e := range searchElem(n, elem) {
+		for e := range SearchElem(n, elem) {
 			// If document is too large there are
 			// would be a hundreds of goroutines :((
-			for attr := range searchAttr(e, key, value) {
+			for attr := range SearchAttr(e, key, value) {
 				ch <- attr
 			}
 		}
@@ -67,7 +67,7 @@ func searchElemAttr(n *html.Node, elem, key, value string) chan *html.Node {
 	return ch
 }
 
-func crawlText(n *html.Node) string {
+func CrawlText(n *html.Node) string {
 	var s = new(strings.Builder)
 	var crawl crawlFunc
 	crawl = func(n *html.Node) {
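The visible effect of this change is that the crawl helpers become part of the package's public API. A minimal sketch of external use (the module path is taken from the go.mod deleted above, so treat it as an assumption; the document string is illustrative):

```go
package main

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"

	// Module path from the removed go.mod; adjust if the module moved.
	scraper "git.nkpl.cc/XoxJlopeZi4BB/scraper.git"
)

func main() {
	doc, err := html.Parse(strings.NewReader(`<p><span id="test">hi</span></p>`))
	if err != nil {
		panic(err)
	}
	// SearchElemAttr yields each <span> whose "id" attribute contains "test";
	// CrawlText collects the text under the matched node ("hi" here).
	for n := range scraper.SearchElemAttr(doc, "span", "id", "test") {
		fmt.Println(scraper.CrawlText(n))
	}
}
```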
scraper_test.go

@@ -15,7 +15,7 @@ func TestSearchElem(t *testing.T) {
 		html.Parse(strings.NewReader(htmlStr)),
 	)
 
-	ch := searchElem(doc, "span")
+	ch := SearchElem(doc, "span")
 	count := 0
 	for range ch {
 		count++
@@ -31,7 +31,7 @@ func TestSearchAttr(t *testing.T) {
 		html.Parse(strings.NewReader(htmlStr)),
 	)
 
-	ch := searchAttr(doc, "id", "test")
+	ch := SearchAttr(doc, "id", "test")
 	count := 0
 	for range ch {
 		count++
@@ -47,7 +47,7 @@ func TestSearchElemAttr(t *testing.T) {
 		html.Parse(strings.NewReader(htmlStr)),
 	)
 
-	ch := searchElemAttr(doc, "span", "id", "test")
+	ch := SearchElemAttr(doc, "span", "id", "test")
 	count := 0
 	for range ch {
 		count++