search_node.go

This commit is contained in:
2026-03-02 22:26:28 +03:00
parent 33b08bdd00
commit 7db719e3ff
9 changed files with 134 additions and 17 deletions

5
go.mod
View File

@@ -1,5 +0,0 @@
module git.nkpl.cc/twocookedfaggots/scraper
go 1.25.5
require golang.org/x/net v0.50.0

2
go.sum
View File

@@ -1,2 +0,0 @@
golang.org/x/net v0.50.0 h1:ucWh9eiCGyDR3vtzso0WMQinm2Dnt8cFMuQa9K33J60=
golang.org/x/net v0.50.0/go.mod h1:UgoSli3F/pBgdJBHCTc+tp3gmrU4XswgGRgtnwWTfyM=

View File

@@ -1,3 +1,4 @@
//go:build ignore
package scraper package scraper
import ( import (
@@ -17,7 +18,7 @@ var body = &html.Node{
var ErrNotAnElementNode = errors.New("not a html.ElementNode") var ErrNotAnElementNode = errors.New("not a html.ElementNode")
func parseFragment(s string) (*html.Node, error) { func ParseFragment(s string) (*html.Node, error) {
n, err := html.ParseFragment(strings.NewReader(s), body) n, err := html.ParseFragment(strings.NewReader(s), body)
if err != nil { if err != nil {
return nil, err return nil, err
@@ -33,13 +34,13 @@ func MatchElemAttr(s string, n *html.Node) (bool, error) {
if n1.Type != html.ElementNode { if n1.Type != html.ElementNode {
return false, errors.Join( return false, errors.Join(
ErrNotAnElementNode, ErrNotAnElementNode,
errors.New("s isn's a html.ElementNode"), errors.New("s isn's a html.ElementNode"),
) )
} }
if n.Type != html.ElementNode { if n.Type != html.ElementNode {
return false, errors.Join( return false, errors.Join(
ErrNotAnElementNode, ErrNotAnElementNode,
errors.New("n isn's a html.ElementNode"), errors.New("n isn's a html.ElementNode"),
) )
} }
@@ -51,7 +52,6 @@ func MatchElemAttr(s string, n *html.Node) (bool, error) {
return false, nil return false, nil
} }
func SearchElemAttr2(s string, n2 *html.Node) (chan *html.Node, error) { func SearchElemAttr2(s string, n2 *html.Node) (chan *html.Node, error) {
ch := make(chan *html.Node) ch := make(chan *html.Node)
var crawl func(*html.Node) var crawl func(*html.Node)

View File

@@ -1,3 +1,4 @@
//go:build ignore
package scraper package scraper
import ( import (
@@ -24,3 +25,5 @@ func TestMatchElemAttr(t *testing.T) {
t.Fail() t.Fail()
} }
} }

View File

@@ -42,7 +42,12 @@ func SearchAttr(n *html.Node, key, contains string) chan *html.Node {
crawl = func(n *html.Node) { crawl = func(n *html.Node) {
for _, a := range n.Attr { for _, a := range n.Attr {
if a.Key == key && if a.Key == key &&
containsWord(a.Val, contains) { func() bool {
if contains == "" {
return true
}
return containsWord(a.Val, contains)
}() {
ch <- n ch <- n
} }
} }
@@ -81,11 +86,11 @@ func CrawlText(n *html.Node) string {
if n.Type == html.TextNode { if n.Type == html.TextNode {
Must( Must(
s.WriteString(func() string { s.WriteString(func() string {
trimmed := strings.TrimSpace(n.Data) trimmed := strings.TrimSpace(n.Data)
if n.Data == "" { if n.Data == "" {
return trimmed return trimmed
} }
return trimmed+"\n" return trimmed + "\n"
}()), }()),
) )
} }
@@ -98,3 +103,12 @@ func CrawlText(n *html.Node) string {
crawl(n) crawl(n)
return s.String() return s.String()
} }
// ExtractAttrValue returns the value of the attribute named key on n,
// or the empty string when n carries no such attribute.
func ExtractAttrValue(n *html.Node, key string) string {
	for i := range n.Attr {
		if attr := n.Attr[i]; attr.Key == key {
			return attr.Val
		}
	}
	return ""
}

40
search_node.go Normal file
View File

@@ -0,0 +1,40 @@
package scraper
import "golang.org/x/net/html"
// MatchNode reports whether n1 and n2 describe the same element: equal
// tag data and attribute lists whose keys match pairwise. Nodes with no
// attributes never match (preserves the original's explicit rule).
func MatchNode(n1 *html.Node, n2 *html.Node) bool {
	if n1.Data != n2.Data {
		return false
	}
	// Both sides must carry at least one attribute to be comparable.
	if len(n1.Attr) == 0 || len(n2.Attr) == 0 {
		return false
	}
	// Differing attribute counts cannot match. This also guards the
	// indexing below: the original indexed n2.Attr with n1's loop index
	// unchecked, panicking when n1 had more attributes than n2.
	if len(n1.Attr) != len(n2.Attr) {
		return false
	}
	// Compare every attribute key pairwise. The original returned after
	// the first pair, silently ignoring all remaining attributes.
	for i, a1 := range n1.Attr {
		if a1.Key != n2.Attr[i].Key {
			return false
		}
	}
	return true
}
// SearchNode walks the tree rooted at n depth-first and streams every
// element node that MatchNode considers equal to target. The returned
// channel is closed once the traversal completes.
func SearchNode(target *html.Node, n *html.Node) chan *html.Node {
	ch := make(chan *html.Node)
	var crawl crawlFunc
	crawl = func(cur *html.Node) {
		if cur.Type == html.ElementNode && MatchNode(target, cur) {
			ch <- cur
		}
		for c := cur.FirstChild; c != nil; c = c.NextSibling {
			crawl(c)
		}
	}
	go func() {
		defer close(ch)
		crawl(n)
	}()
	return ch
}

48
search_node_test.go Normal file
View File

@@ -0,0 +1,48 @@
package scraper
import (
"strings"
"testing"
"golang.org/x/net/html"
)
/*
var body = &html.Node{
Type: html.ElementNode,
Data: "body",
DataAtom: atom.Body,
}
*/
const fragment = `<div id="main-copy">something</div>`
// TestSearchNode parses the same fragment twice and checks that
// SearchNode locates the second tree's root when searching for the first.
func TestSearchNode(t *testing.T) {
	doc1, err := html.ParseFragment(strings.NewReader(fragment), body)
	if err != nil {
		// Fatal, not Error: continuing would index an empty slice below.
		t.Fatal(err)
	}
	n1 := doc1[0]
	doc2, err := html.ParseFragment(strings.NewReader(fragment), body)
	if err != nil {
		t.Fatal(err)
	}
	n2 := doc2[0]
	ch := SearchNode(n1, n2)
	// A bare receive would deadlock when nothing matches; check the
	// channel-closed signal and fail explicitly instead.
	got, ok := <-ch
	if !ok {
		t.Fatal("SearchNode yielded no match for an identical fragment")
	}
	t.Log(CrawlText(got))
}
// TestMatchNode checks that two trees parsed from the same fragment
// are considered equal by MatchNode.
func TestMatchNode(t *testing.T) {
	doc1, err := html.ParseFragment(strings.NewReader(fragment), body)
	if err != nil {
		// Fatal, not Error: continuing would index an empty slice below.
		t.Fatal(err)
	}
	n1 := doc1[0]
	doc2, err := html.ParseFragment(strings.NewReader(fragment), body)
	if err != nil {
		t.Fatal(err)
	}
	n2 := doc2[0]
	// The original only logged the result, so the test could never fail.
	if !MatchNode(n1, n2) {
		t.Error("MatchNode returned false for identical fragments")
	}
}

View File

@@ -10,12 +10,14 @@ import (
const htmlStr = `<div><span id="test">Hello</span><span id="test">World</span></div>` const htmlStr = `<div><span id="test">Hello</span><span id="test">World</span></div>`
/*
func Must[T any](v T, err error) T { func Must[T any](v T, err error) T {
if err != nil { if err != nil {
panic(err) panic(err)
} }
return v return v
} }
*/
func TestSearchElem(t *testing.T) { func TestSearchElem(t *testing.T) {
doc := Must( doc := Must(
@@ -65,7 +67,7 @@ func TestSearchElemAttr(t *testing.T) {
} }
} }
func TestCrawlTest(t *testing.T) { func TestCrawlText(t *testing.T) {
fmt.Println( fmt.Println(
CrawlText(Must( CrawlText(Must(
html.Parse(strings.NewReader(htmlStr)), html.Parse(strings.NewReader(htmlStr)),

17
util.go
View File

@@ -4,6 +4,9 @@ import (
"fmt" "fmt"
"regexp" "regexp"
"strings" "strings"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
) )
func containsWord(s, substr string) (match bool) { func containsWord(s, substr string) (match bool) {
@@ -22,3 +25,17 @@ func containsWord(s, substr string) (match bool) {
} }
return return
} }
// body is a synthetic <body> element node used as the parsing context
// for html.ParseFragment, so fragments are interpreted as if they
// appeared inside an HTML document body.
var body = &html.Node{
Type: html.ElementNode,
Data: "body",
DataAtom: atom.Body,
}
// ParseFragment parses s as an HTML fragment in a <body> context and
// returns the first resulting node, or an error when parsing fails or
// produces no nodes.
func ParseFragment(s string) (*html.Node, error) {
	nodes, err := html.ParseFragment(strings.NewReader(s), body)
	if err != nil {
		return nil, err
	}
	// Guard the index: html.ParseFragment can return an empty slice
	// (e.g. for empty input), which previously panicked here.
	if len(nodes) == 0 {
		return nil, fmt.Errorf("ParseFragment: no nodes parsed from %q", s)
	}
	return nodes[0], nil
}