dharma/main.go

173 lines
3.5 KiB
Go

package main
import (
"flag"
"log"
"net/url"
"os"
"strings"
"sync"
"time"
"github.com/gocolly/colly"
)
// RemoveAsciiTabAndNewlines returns s with every ASCII tab, line feed and
// carriage return removed, as described by step 3 of the WHATWG basic URL
// parser (https://url.spec.whatwg.org/#concept-basic-url-parser).
//
// Step 2 of that algorithm calls these characters a "validation error", but
// that is not a hard failure: browsers simply drop them silently.
//
// The function is mainly an internal helper; it is exported for convenience.
//
// Reference: https://git.inter-media.net/gocolly/colly/commit/5cdc2aa8d8d430faee9bb88b9746545cba315a77
func RemoveAsciiTabAndNewlines(s string) string {
	// A negative return value tells strings.Map to drop the rune.
	dropControl := func(c rune) rune {
		if c == '\t' || c == '\n' || c == '\r' {
			return -1
		}
		return c
	}
	return strings.Map(dropControl, s)
}
// parseAttr reads attribute attr from element e and returns it as a parsed
// URL.
//
// ASCII tabs and newlines are stripped from the attribute value first. A
// value that does not begin with an explicit "http://" or "https://" scheme
// is passed through e.Request.AbsoluteURL before parsing. When parsing fails
// the page URL and the error are logged and the error is returned.
func parseAttr(e *colly.HTMLElement, attr string) (*url.URL, error) {
	href := RemoveAsciiTabAndNewlines(e.Attr(attr))
	// Match the complete scheme prefix: testing for a bare "http" would also
	// catch relative paths such as "http-guide.html" and skip resolving them.
	if !strings.HasPrefix(href, "http://") && !strings.HasPrefix(href, "https://") {
		href = e.Request.AbsoluteURL(href)
	}
	u, err := url.Parse(href)
	if err != nil {
		log.Printf("in: %s", e.Request.URL.String())
		log.Printf("error parsing %s: %s", attr, err)
		return nil, err
	}
	return u, nil
}
// References records, for every link target, the set of pages that link to
// it. All methods take urlMu, so a References may be shared between
// goroutines. The zero value is ready to use.
type References struct {
	// urls maps a target URL string to the pages referencing it. The inner
	// map is keyed by the referencing page's URL string, so each page is
	// stored at most once per target.
	urls  map[string]map[string]*url.URL
	urlMu sync.RWMutex
}

// From returns the pages registered as linking to refLink, in no particular
// order (map iteration order). It returns nil when refLink has never been
// registered.
func (r *References) From(refLink string) (result []*url.URL) {
	r.urlMu.RLock()
	defer r.urlMu.RUnlock()

	ref, exists := r.urls[refLink]
	if !exists {
		return nil
	}
	result = make([]*url.URL, 0, len(ref))
	for _, v := range ref {
		result = append(result, v)
	}
	return result
}

// Register records that page base contains a link to refLink. Registering
// the same pair again is a no-op apart from refreshing the stored pointer.
// Both arguments must be non-nil. The returned error is always nil.
func (r *References) Register(base *url.URL, refLink *url.URL) error {
	target, page := refLink.String(), base.String()

	r.urlMu.Lock()
	defer r.urlMu.Unlock()

	// Create the outer map on demand so a zero-value References does not
	// panic on its first write.
	if r.urls == nil {
		r.urls = make(map[string]map[string]*url.URL)
	}
	pages, exists := r.urls[target]
	if !exists {
		pages = make(map[string]*url.URL)
		r.urls[target] = pages
	}
	pages[page] = base
	return nil
}

// NewReferences returns an empty References.
func NewReferences() *References {
	return &References{
		urls: make(map[string]map[string]*url.URL),
	}
}
// main crawls the site given by the -url flag and, once the crawl finishes,
// logs every URL that failed to load together with the pages that link to it.
//
// Links are only collected from pages served by the start URL's host; targets
// on other hosts are still requested once but their own links are ignored.
func main() {
	fs := flag.NewFlagSet("dharma", flag.ExitOnError)
	startUrl := fs.String("url", "", "Url to start analyzing")
	// Skip os.Args[0]: the program name is a non-flag argument, and the flag
	// package stops parsing at the first one, so -url would never be read.
	if err := fs.Parse(os.Args[1:]); err != nil {
		log.Println(err)
		fs.Usage()
		return
	}
	if *startUrl == "" {
		fs.Usage()
		return
	}

	// The handlers below compare hosts, so the start URL must carry one.
	start, err := url.Parse(*startUrl)
	if err != nil || start.Host == "" {
		log.Printf("invalid start url %q: an absolute URL such as https://example.com is required", *startUrl)
		fs.Usage()
		return
	}

	errors := make(map[string]int)
	errorsMu := sync.RWMutex{}
	references := NewReferences()

	c := colly.NewCollector(
	// colly.AllowedDomains(hostname),
	// colly.Async(true),
	)

	// href and src attributes are handled identically: remember which page
	// referenced the target, then request the target.
	for _, attr := range []string{"href", "src"} {
		attr := attr // per-iteration copy for the closure (needed before Go 1.22)
		c.OnHTML("["+attr+"]", func(e *colly.HTMLElement) {
			// Compare host with host; comparing against the full start URL
			// string never matches and would discard every link.
			if e.Request.URL.Host != start.Host {
				return
			}
			target, err := parseAttr(e, attr)
			if err != nil {
				return
			}
			references.Register(e.Request.URL, target)
			e.Request.Visit(e.Attr(attr))
		})
	}

	c.OnResponse(func(r *colly.Response) {
		// log.Printf("Visited %s: %d", r.Request.URL, r.StatusCode)
	})

	c.OnError(func(r *colly.Response, e error) {
		log.Printf("%d error %s: %s ", r.StatusCode, r.Request.URL.String(), e)
		errorsMu.Lock()
		errors[r.Request.URL.String()] = r.StatusCode
		errorsMu.Unlock()
	})

	// NOTE(review): colly appears to reject a LimitRule that has neither
	// DomainGlob nor DomainRegexp set, so match every domain explicitly and
	// surface the error instead of dropping it - confirm against colly docs.
	if err := c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 2}); err != nil {
		log.Printf("setting limit rule: %s", err)
	}
	c.SetRequestTimeout(10 * time.Second)

	if err := c.Visit(*startUrl); err != nil {
		log.Printf("visiting %s: %s", *startUrl, err)
	}
	c.Wait()

	for errUrl, statusCode := range errors {
		log.Printf("[%d] %s", statusCode, errUrl)
		// Normalise the key the same way as before, but fall back to the raw
		// string rather than dereferencing a nil URL if parsing fails.
		key := errUrl
		if parsedURL, err := url.Parse(errUrl); err == nil {
			key = parsedURL.String()
		}
		log.Println(" Found in:")
		for _, r := range references.From(key) {
			log.Printf(" - %s", r)
		}
	}
}