package main

import (
	"flag"
	"log"
	"net/url"
	"os"
	"strings"
	"sync"
	"time"

	"github.com/gocolly/colly"
)

// RemoveAsciiTabAndNewlines removes the corresponding characters
// according to step 3 of https://url.spec.whatwg.org/#concept-basic-url-parser.
// Although step 2 says "validation error", this is not a hard error,
// and browsers do in fact just silently remove those.
//
// This function is mostly used internally, but it's exported for extra
// convenience. Adapted from
// https://git.inter-media.net/gocolly/colly/commit/5cdc2aa8d8d430faee9bb88b9746545cba315a77
func RemoveAsciiTabAndNewlines(s string) string {
	return strings.Map(func(r rune) rune {
		switch r {
		case '\t', '\n', '\r':
			return -1 // a negative value drops the rune from the output
		default:
			return r
		}
	}, s)
}

// parseAttr reads the given attribute from the element, strips ASCII tabs and
// newlines, resolves relative references against the page URL, and parses the
// result.
func parseAttr(e *colly.HTMLElement, attr string) (*url.URL, error) {
	href := RemoveAsciiTabAndNewlines(e.Attr(attr))
	if !strings.HasPrefix(href, "http") {
		href = e.Request.AbsoluteURL(href)
	}
	u, err := url.Parse(href)
	if err != nil {
		log.Printf("in: %s", e.Request.URL.String())
		log.Printf("error parsing %s: %s", attr, err)
		return nil, err
	}
	return u, nil
}

// References records, for every discovered URL, the set of pages that
// reference it.
type References struct {
	urls  map[string]map[string]*url.URL
	urlMu sync.RWMutex
}

// From returns the pages known to reference refLink.
func (r *References) From(refLink string) (result []*url.URL) {
	r.urlMu.RLock()
	defer r.urlMu.RUnlock()
	for _, v := range r.urls[refLink] {
		result = append(result, v)
	}
	return
}

// Register records that base references refLink.
func (r *References) Register(base *url.URL, refLink *url.URL) {
	r.urlMu.Lock()
	defer r.urlMu.Unlock()
	key := refLink.String()
	if r.urls[key] == nil {
		r.urls[key] = make(map[string]*url.URL)
	}
	r.urls[key][base.String()] = base
}

func NewReferences() *References {
	return &References{
		urls: make(map[string]map[string]*url.URL),
	}
}

func main() {
	fs := flag.NewFlagSet("dharma", flag.ExitOnError)
	startURL := fs.String("url", "", "URL to start analyzing")
	// Skip os.Args[0] (the program name); otherwise Parse stops at the first
	// non-flag argument and -url is never read.
	if err := fs.Parse(os.Args[1:]); err != nil {
		log.Println(err)
		fs.Usage()
		return
	}
	if *startURL == "" {
		fs.Usage()
		return
	}

	// Parse the start URL once so the handlers below can compare hosts
	// instead of comparing a host against the full URL string.
	start, err := url.Parse(*startURL)
	if err != nil || start.Host == "" {
		log.Printf("invalid start URL %q: %v", *startURL, err)
		fs.Usage()
		return
	}

	errors := make(map[string]int)
	errorsMu := sync.RWMutex{}
	references := NewReferences()

	c := colly.NewCollector(
	// colly.AllowedDomains(start.Host),
	// colly.Async(true),
	)

	c.OnHTML("[href]", func(e *colly.HTMLElement) {
		if e.Request.URL.Host != start.Host {
			return
		}
		href, err := parseAttr(e, "href")
		if err != nil {
			return
		}
		references.Register(e.Request.URL, href)
		e.Request.Visit(href.String())
	})

	c.OnHTML("[src]", func(e *colly.HTMLElement) {
		if e.Request.URL.Host != start.Host {
			return
		}
		src, err := parseAttr(e, "src")
		if err != nil {
			return
		}
		references.Register(e.Request.URL, src)
		e.Request.Visit(src.String())
	})

	c.OnResponse(func(r *colly.Response) {
		// log.Printf("Visited %s: %d", r.Request.URL, r.StatusCode)
	})

	c.OnError(func(r *colly.Response, e error) {
		log.Printf("%d error %s: %s", r.StatusCode, r.Request.URL.String(), e)
		errorsMu.Lock()
		errors[r.Request.URL.String()] = r.StatusCode
		errorsMu.Unlock()
	})

	// A LimitRule needs a domain pattern; without one, Limit rejects the rule
	// and the parallelism cap never applies. Parallelism only takes effect
	// when colly.Async(true) is enabled above.
	if err := c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 2}); err != nil {
		log.Println(err)
	}
	c.SetRequestTimeout(10 * time.Second)

	if err := c.Visit(*startURL); err != nil {
		log.Println(err)
	}
	// Wait only blocks when the collector runs in async mode; otherwise it
	// returns immediately.
	c.Wait()

	for errURL, statusCode := range errors {
		log.Printf("[%d] %s", statusCode, errURL)
		log.Println("  Found in:")
		for _, ref := range references.From(errURL) {
			log.Printf("  - %s", ref)
		}
	}
}
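
// Example usage (a sketch; the `go run .` invocation and the URL below are
// assumptions about how this file is built, not part of the source):
//
//	go run . -url https://example.com
//
// The crawler follows every href/src found on pages of the start host,
// records which pages reference each URL, and finally prints every URL that
// produced an error response together with the pages that link to it.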