Add initial archiver

Radhi Fadlillah 2019-06-09 14:54:07 +07:00
parent 95c8717855
commit 4e38387170
16 changed files with 1524 additions and 38 deletions

4
go.mod
View File

@@ -20,9 +20,11 @@ require (
github.com/shurcooL/vfsgen v0.0.0-20181202132449-6a9ea43bcacd
github.com/sirupsen/logrus v1.4.2
github.com/spf13/cobra v0.0.4
github.com/tdewolff/parse/v2 v2.3.7
go.etcd.io/bbolt v1.3.2
golang.org/x/crypto v0.0.0-20190513172903-22d7a77e9e5f
golang.org/x/image v0.0.0-20190523035834-f03afa92d3ff // indirect
golang.org/x/net v0.0.0-20190522155817-f3200d17e092 // indirect
golang.org/x/net v0.0.0-20190522155817-f3200d17e092
golang.org/x/sys v0.0.0-20190526052359-791d8a0f4d09 // indirect
golang.org/x/tools v0.0.0-20190525145741-7be61e1b0e51 // indirect
google.golang.org/appengine v1.6.0 // indirect

6
go.sum
View File

@@ -80,8 +80,14 @@ github.com/stretchr/testify v1.2.2 h1:bSDNvY7ZPG5RlJ8otE/7V6gMiyenm9RtJ7IUVIAoJ1
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/tdewolff/parse/v2 v2.3.7 h1:DXoTUgrUE2Eap0m7zg1ljCO5C78vhEi7HTc4YnJWrRk=
github.com/tdewolff/parse/v2 v2.3.7/go.mod h1:HansaqmN4I/U7L6/tUp0NcwT2tFO0F4EAWYGSDzkYNk=
github.com/tdewolff/test v1.0.0 h1:jOwzqCXr5ePXEPGJaq2ivoR6HOCi+D5TPfpoyg8yvmU=
github.com/tdewolff/test v1.0.0/go.mod h1:DiQUlutnqlEvdvhSn2LPGy4TFwRauAaYDsL+683RNX4=
github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0=
github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q=
go.etcd.io/bbolt v1.3.2 h1:Z/90sZLPOeCy2PwprqkFa25PdkusRzaj9P8zm/KNyvk=
go.etcd.io/bbolt v1.3.2/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU=
golang.org/x/crypto v0.0.0-20181203042331-505ab145d0a9/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2 h1:VklqNMn3ovrHsnt90PveolxSbWFaJdECFbxSq0Mqo2M=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=

View File

@@ -1,12 +1,17 @@
package cmd
import (
"bytes"
"fmt"
"io"
"net/http"
nurl "net/url"
fp "path/filepath"
"strings"
"time"
"github.com/go-shiori/shiori/pkg/warc"
"github.com/go-shiori/go-readability"
"github.com/go-shiori/shiori/internal/model"
"github.com/spf13/cobra"
@@ -73,14 +78,36 @@ func addHandler(cmd *cobra.Command, args []string) {
func() {
cInfo.Println("Downloading article...")
resp, err := httpClient.Get(url)
// Prepare request
req, err := http.NewRequest("GET", url, nil)
if err != nil {
cError.Printf("Failed to download article: %v\n", err)
return
}
// Send request
req.Header.Set("User-Agent", "Shiori/2.0.0 (+https://github.com/go-shiori/shiori)")
resp, err := httpClient.Do(req)
if err != nil {
cError.Printf("Failed to download article: %v\n", err)
return
}
defer resp.Body.Close()
article, err := readability.FromReader(resp.Body, url)
// Save as archive
buffer := bytes.NewBuffer(nil)
tee := io.TeeReader(resp.Body, buffer)
contentType := resp.Header.Get("Content-Type")
archivePath := fp.Join(DataDir, "archive", fmt.Sprintf("%d", book.ID))
err = warc.FromReader(tee, url, contentType, archivePath)
if err != nil {
cError.Printf("Failed to create archive: %v\n", err)
return
}
// Parse article
article, err := readability.FromReader(buffer, url)
if err != nil {
cError.Printf("Failed to parse article: %v\n", err)
return

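The handler above needs to consume the response body twice: once for the archiver and once for readability. Below is a minimal, self-contained sketch of the io.TeeReader pattern it relies on (standard library only; the HTML string and variable names are illustrative):

package main

import (
	"bytes"
	"fmt"
	"io"
	"io/ioutil"
	"strings"
)

func main() {
	body := strings.NewReader("<html><body>hello</body></html>")

	// Everything read from tee is also copied into buffer,
	// so the same stream can be consumed a second time.
	buffer := bytes.NewBuffer(nil)
	tee := io.TeeReader(body, buffer)

	first, _ := ioutil.ReadAll(tee)     // first consumer, e.g. the archiver
	second, _ := ioutil.ReadAll(buffer) // second consumer, e.g. readability

	fmt.Println(len(first), len(second)) // both see the full body
}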
View File

@@ -4,6 +4,7 @@ import (
"fmt"
"os"
fp "path/filepath"
"strconv"
"strings"
"github.com/spf13/cobra"
@@ -57,18 +58,20 @@ func deleteHandler(cmd *cobra.Command, args []string) {
return
}
// Delete thumbnail image from local disk
// Delete thumbnail image and archives from local disk
if len(ids) == 0 {
thumbDir := fp.Join(DataDir, "thumb")
archiveDir := fp.Join(DataDir, "archive")
os.RemoveAll(thumbDir)
os.RemoveAll(archiveDir)
} else {
for _, id := range ids {
imgPath := fp.Join(DataDir, "thumb", fmt.Sprintf("%d.*", id))
matchedFiles, _ := fp.Glob(imgPath)
strID := strconv.Itoa(id)
imgPath := fp.Join(DataDir, "thumb", strID)
archivePath := fp.Join(DataDir, "archive", strID)
for _, f := range matchedFiles {
os.Remove(f)
}
os.Remove(imgPath)
os.Remove(archivePath)
}
}

View File

@@ -2,9 +2,15 @@ package cmd
import (
"fmt"
"net"
"net/http"
fp "path/filepath"
"strconv"
"strings"
"github.com/go-shiori/shiori/internal/database"
"github.com/go-shiori/shiori/pkg/warc"
"github.com/julienschmidt/httprouter"
"github.com/spf13/cobra"
)
@@ -20,6 +26,7 @@ func openCmd() *cobra.Command {
}
cmd.Flags().BoolP("yes", "y", false, "Skip confirmation prompt and open ALL bookmarks")
cmd.Flags().BoolP("archive", "a", false, "Open the bookmark's archived content")
cmd.Flags().BoolP("text-cache", "t", false, "Open the bookmark's text cache in terminal")
return cmd
@@ -28,8 +35,22 @@ func openCmd() *cobra.Command {
func openHandler(cmd *cobra.Command, args []string) {
// Parse flags
skipConfirm, _ := cmd.Flags().GetBool("yes")
archiveMode, _ := cmd.Flags().GetBool("archive")
textCacheMode, _ := cmd.Flags().GetBool("text-cache")
// Convert args to ids
ids, err := parseStrIndices(args)
if err != nil {
cError.Println(err)
return
}
// If in archive mode, only one bookmark allowed
if len(ids) > 1 && archiveMode {
cError.Println("In archive mode, only one bookmark allowed")
return
}
// If no arguments (i.e all bookmarks will be opened),
// confirm to user
if len(args) == 0 && !skipConfirm {
@@ -42,13 +63,6 @@ func openHandler(cmd *cobra.Command, args []string) {
}
}
// Convert args to ids
ids, err := parseStrIndices(args)
if err != nil {
cError.Println(err)
return
}
// Read bookmarks from database
getOptions := database.GetBookmarksOptions{
IDs: ids,
@@ -62,17 +76,16 @@ func openHandler(cmd *cobra.Command, args []string) {
}
if len(bookmarks) == 0 {
switch {
case len(ids) > 0:
if len(ids) > 0 {
cError.Println("No matching index found")
default:
} else {
cError.Println("No bookmarks saved yet")
}
return
}
// If not text cache mode, open bookmarks in browser
if !textCacheMode {
// If not text cache mode nor archive mode, open bookmarks in browser
if !textCacheMode && !archiveMode {
for _, book := range bookmarks {
err = openBrowser(book.URL)
if err != nil {
@@ -83,22 +96,74 @@ func openHandler(cmd *cobra.Command, args []string) {
}
// Show bookmarks content in terminal
termWidth := getTerminalWidth()
if textCacheMode {
termWidth := getTerminalWidth()
for _, book := range bookmarks {
cIndex.Printf("%d. ", book.ID)
cTitle.Println(book.Title)
fmt.Println()
for _, book := range bookmarks {
cIndex.Printf("%d. ", book.ID)
cTitle.Println(book.Title)
fmt.Println()
if book.Content == "" {
cError.Println("This bookmark doesn't have any cached content")
} else {
book.Content = strings.Join(strings.Fields(book.Content), " ")
fmt.Println(book.Content)
if book.Content == "" {
cError.Println("This bookmark doesn't have any cached content")
} else {
book.Content = strings.Join(strings.Fields(book.Content), " ")
fmt.Println(book.Content)
}
fmt.Println()
cSymbol.Println(strings.Repeat("=", termWidth))
fmt.Println()
}
}
// Open archive
id := strconv.Itoa(bookmarks[0].ID)
archivePath := fp.Join(DataDir, "archive", id)
archive, err := warc.Open(archivePath)
if err != nil {
cError.Printf("Failed to open archive: %v\n", err)
return
}
defer archive.Close()
// Create simple server
router := httprouter.New()
router.GET("/*filename", func(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
filename := ps.ByName("filename")
resourceName := fp.Base(filename)
if resourceName == "/" {
resourceName = ""
}
fmt.Println()
cSymbol.Println(strings.Repeat("=", termWidth))
fmt.Println()
content, contentType, err := archive.Read(resourceName)
if err != nil {
panic(err)
}
w.Header().Set("Content-Type", contentType)
if _, err = w.Write(content); err != nil {
panic(err)
}
})
router.PanicHandler = func(w http.ResponseWriter, r *http.Request, arg interface{}) {
http.Error(w, fmt.Sprint(arg), 500)
}
// Choose random port
listener, err := net.Listen("tcp", ":0")
if err != nil {
cError.Printf("Failed to serve archive: %v\n", err)
return
}
portNumber := listener.Addr().(*net.TCPAddr).Port
cInfo.Printf("Archive served in http://localhost:%d\n", portNumber)
err = http.Serve(listener, router)
if err != nil {
cError.Printf("Failed to serve archive: %v\n", err)
}
}

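The open command serves the archive on a random free port by listening on ":0" and reading the chosen port back from the listener. A stand-alone sketch of that trick (the handler body is a placeholder, not the archive handler above):

package main

import (
	"fmt"
	"log"
	"net"
	"net/http"
)

func main() {
	// Port 0 asks the OS to pick any free port.
	listener, err := net.Listen("tcp", ":0")
	if err != nil {
		log.Fatal(err)
	}

	port := listener.Addr().(*net.TCPAddr).Port
	fmt.Printf("Serving at http://localhost:%d\n", port)

	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprintln(w, "archived content would be written here")
	})
	log.Fatal(http.Serve(listener, nil))
}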
View File

@@ -1,7 +1,9 @@
package cmd
import (
"crypto/tls"
"net/http"
"net/http/cookiejar"
"time"
"github.com/go-shiori/shiori/internal/database"
@@ -15,9 +17,22 @@ var (
// DataDir is directory for downloaded data
DataDir string
httpClient = &http.Client{Timeout: time.Minute}
httpClient *http.Client
)
func init() {
jar, _ := cookiejar.New(nil)
httpClient = &http.Client{
Timeout: time.Minute,
Transport: &http.Transport{
TLSClientConfig: &tls.Config{
InsecureSkipVerify: true,
},
},
Jar: jar,
}
}
// ShioriCmd returns the root command for shiori
func ShioriCmd() *cobra.Command {
rootCmd := &cobra.Command{

View File

@@ -1,7 +1,10 @@
package cmd
import (
"bytes"
"fmt"
"io"
"net/http"
nurl "net/url"
fp "path/filepath"
"sort"
@@ -12,6 +15,7 @@ import (
"github.com/go-shiori/go-readability"
"github.com/go-shiori/shiori/internal/database"
"github.com/go-shiori/shiori/internal/model"
"github.com/go-shiori/shiori/pkg/warc"
"github.com/spf13/cobra"
)
@@ -139,8 +143,17 @@ func updateHandler(cmd *cobra.Command, args []string) {
<-semaphore
}()
// Download article
resp, err := httpClient.Get(book.URL)
// Prepare request
req, err := http.NewRequest("GET", book.URL, nil)
if err != nil {
chProblem <- book.ID
chMessage <- fmt.Errorf("Failed to download %s: %v", book.URL, err)
return
}
// Send request
req.Header.Set("User-Agent", "Shiori/2.0.0 (+https://github.com/go-shiori/shiori)")
resp, err := httpClient.Do(req)
if err != nil {
chProblem <- book.ID
chMessage <- fmt.Errorf("Failed to download %s: %v", book.URL, err)
@@ -148,7 +161,21 @@ func updateHandler(cmd *cobra.Command, args []string) {
}
defer resp.Body.Close()
article, err := readability.FromReader(resp.Body, book.URL)
// Save as archive
buffer := bytes.NewBuffer(nil)
tee := io.TeeReader(resp.Body, buffer)
contentType := resp.Header.Get("Content-Type")
archivePath := fp.Join(DataDir, "archive", fmt.Sprintf("%d", book.ID))
err = warc.FromReader(tee, book.URL, contentType, archivePath)
if err != nil {
chProblem <- book.ID
chMessage <- fmt.Errorf("Failed to create archive %s: %v", book.URL, err)
return
}
// Parse article
article, err := readability.FromReader(buffer, book.URL)
if err != nil {
chProblem <- book.ID
chMessage <- fmt.Errorf("Failed to parse %s: %v", book.URL, err)

View File

@@ -0,0 +1,173 @@
package archiver
import (
"fmt"
"strings"
"sync"
"time"
"go.etcd.io/bbolt"
)
// Archiver is a struct for archiving a URL and its resources.
type Archiver struct {
sync.RWMutex
sync.WaitGroup
DB *bbolt.DB
ChDone chan struct{}
ChErrors chan error
ChWarnings chan error
ChRequest chan ResourceURL
ResourceMap map[string]struct{}
LogEnabled bool
}
// Close closes the channels used by the Archiver.
func (arc *Archiver) Close() {
close(arc.ChErrors)
close(arc.ChWarnings)
close(arc.ChRequest)
}
// StartArchiver starts the archival process.
func (arc *Archiver) StartArchiver() []error {
go func() {
time.Sleep(time.Second)
arc.Wait()
close(arc.ChDone)
}()
// Download the URLs concurrently. After each download finishes,
// parse the response to extract the resource URLs inside it,
// then send them to the request channel to be downloaded as well.
errors := make([]error, 0)
warnings := make([]error, 0)
func() {
for {
select {
case <-arc.ChDone:
return
case err := <-arc.ChErrors:
errors = append(errors, err)
case err := <-arc.ChWarnings:
warnings = append(warnings, err)
case res := <-arc.ChRequest:
arc.RLock()
_, exist := arc.ResourceMap[res.DownloadURL]
arc.RUnlock()
if !exist {
arc.Add(1)
go arc.archive(res)
}
}
}
}()
// Print log message if required
if arc.LogEnabled {
nErrors := len(errors)
nWarnings := len(warnings)
arc.Logf(infoLog, "Download finished with %d warnings and %d errors\n", nWarnings, nErrors)
if nWarnings > 0 {
fmt.Println()
for _, warning := range warnings {
arc.Log(warningLog, warning)
}
}
if nErrors > 0 {
for _, err := range errors {
arc.Log(errorLog, err)
}
}
}
return nil
}
// archive downloads a subresource and saves it to storage.
func (arc *Archiver) archive(res ResourceURL) {
// Make sure to decrease wait group once finished
defer arc.Done()
// Download resource
resp, err := DownloadData(res.DownloadURL)
if err != nil {
arc.ChErrors <- fmt.Errorf("failed to download %s: %v", res.DownloadURL, err)
return
}
defer resp.Body.Close()
// Process the resource depending on its type.
// Since this `archive` method is only used for processing sub
// resources, we only give special treatment to CSS sub resources.
// Any other file is simply saved as it is.
var result ProcessResult
var subResources []ResourceURL
cType := resp.Header.Get("Content-Type")
switch {
case strings.Contains(cType, "text/css"):
result, subResources, err = arc.ProcessCSSFile(res, resp.Body)
default:
result, err = arc.ProcessOtherFile(res, resp.Body)
}
if err != nil {
arc.ChErrors <- fmt.Errorf("failed to process %s: %v", res.DownloadURL, err)
return
}
// Add this url to resource map
arc.Lock()
arc.ResourceMap[res.DownloadURL] = struct{}{}
arc.Unlock()
// Save content to storage
arc.Logf(infoLog, "Downloaded %s, parent %s", res.DownloadURL, res.Parent)
result.ContentType = cType
err = arc.SaveToStorage(result)
if err != nil {
arc.ChErrors <- fmt.Errorf("failed to save %s: %v", res.DownloadURL, err)
return
}
// Send sub resource to request channel
for _, subRes := range subResources {
arc.ChRequest <- subRes
}
}
// SaveToStorage saves the processing result to storage.
func (arc *Archiver) SaveToStorage(result ProcessResult) error {
err := arc.DB.Batch(func(tx *bbolt.Tx) error {
bucket := tx.Bucket([]byte(result.Name))
if bucket != nil {
return nil
}
bucket, err := tx.CreateBucketIfNotExists([]byte(result.Name))
if err != nil {
return err
}
err = bucket.Put([]byte("content"), result.Content)
if err != nil {
return err
}
err = bucket.Put([]byte("type"), []byte(result.ContentType))
if err != nil {
return err
}
return nil
})
return err
}

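SaveToStorage stores every resource in its own bbolt bucket with two keys, "content" and "type". A minimal sketch of writing and reading that layout with bbolt directly (the file path and values are placeholders; this mirrors, but is not, the commit's code):

package main

import (
	"fmt"
	"log"

	"go.etcd.io/bbolt"
)

func main() {
	db, err := bbolt.Open("/tmp/sketch-archive", 0600, nil)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// One bucket per resource, holding its body and its content type.
	err = db.Update(func(tx *bbolt.Tx) error {
		bucket, err := tx.CreateBucketIfNotExists([]byte("archive-root"))
		if err != nil {
			return err
		}
		if err := bucket.Put([]byte("content"), []byte("<html></html>")); err != nil {
			return err
		}
		return bucket.Put([]byte("type"), []byte("text/html"))
	})
	if err != nil {
		log.Fatal(err)
	}

	// Reading it back mirrors what pkg/warc's Archive.Read does.
	err = db.View(func(tx *bbolt.Tx) error {
		bucket := tx.Bucket([]byte("archive-root"))
		fmt.Printf("%s: %s\n", bucket.Get([]byte("type")), bucket.Get([]byte("content")))
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}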
View File

@@ -0,0 +1,38 @@
package archiver
import (
"crypto/tls"
"net/http"
"net/http/cookiejar"
"time"
)
var (
defaultClient *http.Client
)
func init() {
jar, _ := cookiejar.New(nil)
defaultClient = &http.Client{
Timeout: time.Minute,
Transport: &http.Transport{
TLSClientConfig: &tls.Config{
InsecureSkipVerify: true,
},
},
Jar: jar,
}
}
// DownloadData downloads data from the specified URL.
func DownloadData(url string) (*http.Response, error) {
// Prepare request
req, err := http.NewRequest("GET", url, nil)
if err != nil {
return nil, err
}
// Send request
req.Header.Set("User-Agent", "Shiori/2.0.0 (+https://github.com/go-shiori/shiori)")
return defaultClient.Do(req)
}

View File

@@ -0,0 +1,43 @@
package archiver
import "github.com/sirupsen/logrus"
type logType int
const (
infoLog logType = iota
errorLog
warningLog
)
// Log prints the log message, ended with a newline.
func (arc *Archiver) Log(tp logType, msgs ...interface{}) {
if !arc.LogEnabled {
return
}
switch tp {
case errorLog:
logrus.Errorln(msgs...)
case warningLog:
logrus.Warnln(msgs...)
default:
logrus.Infoln(msgs...)
}
}
// Logf prints the log message with the specified format.
func (arc *Archiver) Logf(tp logType, format string, msgs ...interface{}) {
if !arc.LogEnabled {
return
}
switch tp {
case errorLog:
logrus.Errorf(format, msgs...)
case warningLog:
logrus.Warnf(format, msgs...)
default:
logrus.Infof(format, msgs...)
}
}

View File

@@ -0,0 +1,468 @@
package archiver
import (
"bytes"
"fmt"
"io"
nurl "net/url"
"regexp"
"strings"
"github.com/tdewolff/parse/v2/css"
"github.com/tdewolff/parse/v2/js"
"golang.org/x/net/html"
)
// ProcessResult is the result from content processing.
type ProcessResult struct {
Name string
ContentType string
Content []byte
}
var (
rxImageMeta = regexp.MustCompile(`(?i)image|thumbnail`)
rxLazyImageSrcset = regexp.MustCompile(`(?i)\.(jpg|jpeg|png|webp)\s+\d`)
rxLazyImageSrc = regexp.MustCompile(`(?i)^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$`)
)
// ProcessHTMLFile processes an HTML file that is submitted through the io.Reader.
func (arc *Archiver) ProcessHTMLFile(res ResourceURL, input io.Reader) (result ProcessResult, resources []ResourceURL, err error) {
// Parse HTML document
doc, err := html.Parse(input)
if err != nil {
return ProcessResult{}, nil, fmt.Errorf("failed to parse HTML for %s: %v", res.DownloadURL, err)
}
// Parse URL
parsedURL, err := nurl.ParseRequestURI(res.DownloadURL)
if err != nil || parsedURL.Scheme == "" || parsedURL.Hostname() == "" {
return ProcessResult{}, nil, fmt.Errorf("url %s is not valid", res.DownloadURL)
}
// Convert lazy-loaded images to normal images
fixLazyImages(doc)
// Convert hyperlinks with relative URLs
fixRelativeURIs(doc, parsedURL)
// Extract resources from each node
for _, node := range getElementsByTagName(doc, "*") {
// First extract resources from inline style
cssResources := extractInlineCSS(node, parsedURL)
resources = append(resources, cssResources...)
// Next extract resources from tag's specific attribute
nodeResources := []ResourceURL{}
switch tagName(node) {
case "style":
nodeResources = extractStyleTag(node, parsedURL)
case "script":
nodeResources = extractScriptTag(node, parsedURL)
case "meta":
nodeResources = extractMetaTag(node, parsedURL)
case "img", "picture", "figure", "video", "audio", "source":
nodeResources = extractMediaTag(node, parsedURL)
case "link":
nodeResources = extractGenericTag(node, "href", parsedURL)
case "iframe":
nodeResources = extractGenericTag(node, "src", parsedURL)
case "object":
nodeResources = extractGenericTag(node, "data", parsedURL)
default:
continue
}
resources = append(resources, nodeResources...)
}
// Get outer HTML of the doc
result = ProcessResult{
Name: res.ArchivalURL,
Content: outerHTML(doc),
}
return result, resources, nil
}
// ProcessCSSFile processes a CSS file that is submitted through the io.Reader.
func (arc *Archiver) ProcessCSSFile(res ResourceURL, input io.Reader) (result ProcessResult, resources []ResourceURL, err error) {
// Parse URL
parsedURL, err := nurl.ParseRequestURI(res.DownloadURL)
if err != nil || parsedURL.Scheme == "" || parsedURL.Hostname() == "" {
return ProcessResult{}, nil, fmt.Errorf("url %s is not valid", res.DownloadURL)
}
// Extract CSS rules
rules, resources := processCSS(input, parsedURL)
result = ProcessResult{
Name: res.ArchivalURL,
Content: []byte(rules),
}
return result, resources, nil
}
// ProcessOtherFile processes files that are not HTML, JS or CSS and are submitted through the io.Reader.
func (arc *Archiver) ProcessOtherFile(res ResourceURL, input io.Reader) (result ProcessResult, err error) {
// Copy data to buffer
buffer := bytes.NewBuffer(nil)
_, err = io.Copy(buffer, input)
if err != nil {
return ProcessResult{}, fmt.Errorf("failed to copy data: %v", err)
}
// Create result
result = ProcessResult{
Name: res.ArchivalURL,
Content: buffer.Bytes(),
}
return result, nil
}
// fixRelativeURIs converts each <a> in the given element
// to an absolute URI, ignoring #ref URIs.
func fixRelativeURIs(doc *html.Node, pageURL *nurl.URL) {
links := getAllNodesWithTag(doc, "a")
forEachNode(links, func(link *html.Node, _ int) {
href := getAttribute(link, "href")
if href == "" {
return
}
// Replace links with javascript: URIs with text content,
// since they won't work after scripts have been removed
// from the page.
if strings.HasPrefix(href, "javascript:") {
text := createTextNode(textContent(link))
replaceNode(link, text)
} else {
newHref := toAbsoluteURI(href, pageURL)
if newHref == "" {
removeAttribute(link, "href")
} else {
setAttribute(link, "href", newHref)
}
}
})
}
// fixLazyImages converts images and figures that have properties like data-src into
// images that can be loaded without JS.
func fixLazyImages(root *html.Node) {
imageNodes := getAllNodesWithTag(root, "img", "picture", "figure")
forEachNode(imageNodes, func(elem *html.Node, _ int) {
src := getAttribute(elem, "src")
srcset := getAttribute(elem, "srcset")
nodeTag := tagName(elem)
nodeClass := className(elem)
if (src == "" && srcset == "") || strings.Contains(strings.ToLower(nodeClass), "lazy") {
for i := 0; i < len(elem.Attr); i++ {
attr := elem.Attr[i]
if attr.Key == "src" || attr.Key == "srcset" {
continue
}
copyTo := ""
if rxLazyImageSrcset.MatchString(attr.Val) {
copyTo = "srcset"
} else if rxLazyImageSrc.MatchString(attr.Val) {
copyTo = "src"
}
if copyTo == "" {
continue
}
if nodeTag == "img" || nodeTag == "picture" {
// if this is an img or picture, set the attribute directly
setAttribute(elem, copyTo, attr.Val)
} else if nodeTag == "figure" && len(getAllNodesWithTag(elem, "img", "picture")) == 0 {
// if the item is a <figure> that does not contain an image or picture,
// create one and place it inside the figure; see the nytimes-3
// testcase for an example.
img := createElement("img")
setAttribute(img, copyTo, attr.Val)
appendChild(elem, img)
}
}
}
})
}
// extractInlineCSS extracts archival resources from the CSS rules inside
// the style attribute. Once finished, all CSS URLs in the style attribute
// will be updated to use the archival URL.
func extractInlineCSS(node *html.Node, pageURL *nurl.URL) []ResourceURL {
// Make sure this node has inline style
styleAttr := getAttribute(node, "style")
if styleAttr == "" {
return nil
}
// Extract resource URLs from the inline style
// and update the CSS rules accordingly.
reader := strings.NewReader(styleAttr)
newStyleAttr, resources := processCSS(reader, pageURL)
setAttribute(node, "style", newStyleAttr)
return resources
}
// extractStyleTag extracts archival resources from inside a <style> tag.
// Once finished, all CSS URLs will be updated to use the archival URL.
func extractStyleTag(node *html.Node, pageURL *nurl.URL) []ResourceURL {
// Extract CSS rules from <style>
rules := textContent(node)
rules = strings.TrimSpace(rules)
if rules == "" {
return nil
}
// Extract resource URLs from the rules and update it accordingly.
reader := strings.NewReader(rules)
newRules, resources := processCSS(reader, pageURL)
setTextContent(node, newRules)
return resources
}
// extractScriptTag extracts archival resources from inside a <script> tag.
// Once finished, all URLs inside it will be updated to use the archival URL.
func extractScriptTag(node *html.Node, pageURL *nurl.URL) []ResourceURL {
// Also get the URL from `src` attribute
resources := extractGenericTag(node, "src", pageURL)
// Extract JS code from the <script> itself
script := textContent(node)
script = strings.TrimSpace(script)
if script == "" {
return resources
}
reader := strings.NewReader(script)
newScript, scriptResources := processJS(reader, pageURL)
setTextContent(node, newScript)
resources = append(resources, scriptResources...)
return resources
}
// extractMetaTag extracts archival resources from inside a <meta> tag.
// Normally, <meta> doesn't carry any resource URLs. However, as
// social media has grown, new metadata has been added to point to
// the hero image for a web page, e.g. og:image, twitter:image, etc.
// Once finished, all image URLs in the <meta> will be updated
// to use the archival URL.
func extractMetaTag(node *html.Node, pageURL *nurl.URL) []ResourceURL {
// Get the needed attributes
name := getAttribute(node, "name")
property := getAttribute(node, "property")
content := getAttribute(node, "content")
// If this <meta> is not for image, don't process it
if !rxImageMeta.MatchString(name + " " + property) {
return nil
}
// If URL is not valid, skip
tmp, err := nurl.ParseRequestURI(content)
if err != nil || tmp.Scheme == "" || tmp.Hostname() == "" {
return nil
}
// Create the archive resource and update the content URL
res := ToResourceURL(content, pageURL)
if res.ArchivalURL == "" {
return nil
}
setAttribute(node, "content", res.ArchivalURL)
return []ResourceURL{res}
}
// extractMediaTag extracts resources from inside a media tag, e.g.
// <img>, <video>, <audio>, <source>. Once finished, all URLs will be
// updated to use the archival URL.
func extractMediaTag(node *html.Node, pageURL *nurl.URL) []ResourceURL {
// Get the needed attributes
src := getAttribute(node, "src")
poster := getAttribute(node, "poster")
strSrcSets := getAttribute(node, "srcset")
// Create initial resources
resources := []ResourceURL{}
// Save `src` and `poster` to resources
if src != "" {
res := ToResourceURL(src, pageURL)
if res.ArchivalURL != "" {
setAttribute(node, "src", res.ArchivalURL)
resources = append(resources, res)
}
}
if poster != "" {
res := ToResourceURL(poster, pageURL)
if res.ArchivalURL != "" {
setAttribute(node, "poster", res.ArchivalURL)
resources = append(resources, res)
}
}
// Split srcset by comma, then process it like any URLs
srcSets := strings.Split(strSrcSets, ",")
for i, srcSet := range srcSets {
srcSet = strings.TrimSpace(srcSet)
parts := strings.SplitN(srcSet, " ", 2)
if parts[0] == "" {
continue
}
res := ToResourceURL(parts[0], pageURL)
if res.ArchivalURL == "" {
continue
}
srcSets[i] = strings.Replace(srcSets[i], parts[0], res.ArchivalURL, 1)
resources = append(resources, res)
}
if len(srcSets) > 0 {
setAttribute(node, "srcset", strings.Join(srcSets, ","))
}
return resources
}
// extractGenericTag extracts a resource from the specified attribute.
// This method is used for tags where the URL obviously exists in
// the tag, without any additional processing needed to extract it.
// Examples are <link> with its href, <object> with its data, etc.
// Once finished, the URL attribute will be updated to use the
// archival URL.
func extractGenericTag(node *html.Node, attrName string, pageURL *nurl.URL) []ResourceURL {
// Get the needed attributes
attrValue := getAttribute(node, attrName)
if attrValue == "" {
return nil
}
res := ToResourceURL(attrValue, pageURL)
if res.ArchivalURL == "" {
return nil
}
setAttribute(node, attrName, res.ArchivalURL)
return []ResourceURL{res}
}
// processCSS extracts resource URLs from the specified CSS input.
// Returns the new rules with all CSS URLs updated to the archival link.
func processCSS(input io.Reader, baseURL *nurl.URL) (string, []ResourceURL) {
// Prepare buffers
buffer := bytes.NewBuffer(nil)
// Scan CSS file and process the resource's URL
lexer := css.NewLexer(input)
resources := []ResourceURL{}
for {
token, bt := lexer.Next()
// Check for error
if token == css.ErrorToken {
break
}
// If it's not a URL, just write it to the buffer as it is
if token != css.URLToken {
buffer.Write(bt)
continue
}
// Sanitize the URL by removing `url()`, quotation mark and trailing slash
cssURL := string(bt)
cssURL = rxStyleURL.ReplaceAllString(cssURL, "$1")
cssURL = rxSingleQuote.ReplaceAllString(cssURL, "$1")
cssURL = rxDoubleQuote.ReplaceAllString(cssURL, "$1")
// Save the CSS URL and replace it with archival URL
res := ToResourceURL(cssURL, baseURL)
if res.ArchivalURL == "" {
continue
}
cssURL = `url("` + res.ArchivalURL + `")`
buffer.WriteString(cssURL)
resources = append(resources, res)
}
// Return the new rules after all URLs have been processed
return buffer.String(), resources
}
// processJS extracts resource URLs from the specified JS input.
// Returns the new rules with all URLs updated to the archival link.
func processJS(input io.Reader, baseURL *nurl.URL) (string, []ResourceURL) {
// Prepare buffers
buffer := bytes.NewBuffer(nil)
// Scan JS file and process the resource's URL
lexer := js.NewLexer(input)
resources := []ResourceURL{}
for {
token, bt := lexer.Next()
// Check for error
if token == js.ErrorToken {
break
}
// If it's not a string, just write it to buffer as it is
if token != js.StringToken {
buffer.Write(bt)
continue
}
// Process the string.
// Unlike CSS, JS doesn't have its own URL token, so we can only guess whether
// a string is a URL or not. There are three criteria to decide if it's a URL:
// - it starts with http(s):// for an absolute URL
// - it starts with a slash (/) for a relative URL
// - it is surrounded by `url()` just like in CSS
// If it doesn't fulfill any of the criteria above, just write it as it is.
var res ResourceURL
var newURL string
text := string(bt)
text = rxSingleQuote.ReplaceAllString(text, "$1")
text = rxDoubleQuote.ReplaceAllString(text, "$1")
if strings.HasPrefix(text, "url(") {
cssURL := rxStyleURL.ReplaceAllString(text, "$1")
cssURL = rxSingleQuote.ReplaceAllString(cssURL, "$1")
cssURL = rxDoubleQuote.ReplaceAllString(cssURL, "$1")
res = ToResourceURL(cssURL, baseURL)
newURL = fmt.Sprintf("\"url('%s')\"", res.ArchivalURL)
} else {
buffer.Write(bt)
continue
}
if res.ArchivalURL == "" {
continue
}
buffer.WriteString(newURL)
resources = append(resources, res)
}
// Return the new script after all URLs have been processed
return buffer.String(), resources
}

View File

@@ -0,0 +1,50 @@
package archiver
import (
nurl "net/url"
"regexp"
"strings"
)
var (
rxHTTPScheme = regexp.MustCompile(`(?i)^https?:\/{2}`)
rxTrailingSlash = regexp.MustCompile(`(?i)/+$`)
rxRepeatedStrip = regexp.MustCompile(`(?i)-+`)
)
// ResourceURL is a struct that contains the URLs for downloading
// and archiving a resource.
type ResourceURL struct {
DownloadURL string
ArchivalURL string
Parent string
}
// ToResourceURL converts a URI into a ResourceURL.
func ToResourceURL(uri string, base *nurl.URL) ResourceURL {
// Make sure URL has a valid scheme
uri = strings.TrimSpace(uri)
switch {
case uri == "",
strings.Contains(uri, ":") && !rxHTTPScheme.MatchString(uri):
return ResourceURL{}
}
// Create archive URL
downloadURL := toAbsoluteURI(uri, base)
downloadURL = rxTrailingSlash.ReplaceAllString(downloadURL, "")
downloadURL = strings.ReplaceAll(downloadURL, " ", "+")
archivalURL := strings.Replace(downloadURL, "://", "/", 1)
archivalURL = strings.ReplaceAll(archivalURL, "?", "-")
archivalURL = strings.ReplaceAll(archivalURL, "#", "-")
archivalURL = strings.ReplaceAll(archivalURL, "/", "-")
archivalURL = strings.ReplaceAll(archivalURL, " ", "-")
archivalURL = rxRepeatedStrip.ReplaceAllString(archivalURL, "-")
return ResourceURL{
DownloadURL: downloadURL,
ArchivalURL: archivalURL,
Parent: base.String(),
}
}

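To make the URL mapping concrete, here is a hypothetical test (not part of this commit) showing what ToResourceURL produces for a relative URI, based on the code above:

package archiver

import (
	nurl "net/url"
	"testing"
)

func TestToResourceURLSketch(t *testing.T) {
	base, _ := nurl.ParseRequestURI("https://example.com/post/1")
	res := ToResourceURL("/css/style.css?v=1", base)

	// The URI is resolved against the base URL...
	if res.DownloadURL != "https://example.com/css/style.css?v=1" {
		t.Fatal(res.DownloadURL)
	}
	// ...then flattened into a name that is safe to use as a bbolt bucket.
	if res.ArchivalURL != "https-example.com-css-style.css-v=1" {
		t.Fatal(res.ArchivalURL)
	}
}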
View File

@@ -0,0 +1,334 @@
package archiver
import (
"bytes"
"strings"
"golang.org/x/net/html"
)
// getElementsByTagName returns a collection of all elements in the document with
// the specified tag name, as an array of Node objects.
// The special tag "*" represents all elements.
func getElementsByTagName(doc *html.Node, tagName string) []*html.Node {
var results []*html.Node
var finder func(*html.Node)
finder = func(node *html.Node) {
if node.Type == html.ElementNode && (tagName == "*" || node.Data == tagName) {
results = append(results, node)
}
for child := node.FirstChild; child != nil; child = child.NextSibling {
finder(child)
}
}
for child := doc.FirstChild; child != nil; child = child.NextSibling {
finder(child)
}
return results
}
// createElement creates a new ElementNode with specified tag.
func createElement(tagName string) *html.Node {
return &html.Node{
Type: html.ElementNode,
Data: tagName,
}
}
// createTextNode creates a new Text node.
func createTextNode(data string) *html.Node {
return &html.Node{
Type: html.TextNode,
Data: data,
}
}
// tagName returns the tag name of a Node.
// If it's not ElementNode, return empty string.
func tagName(node *html.Node) string {
if node.Type != html.ElementNode {
return ""
}
return node.Data
}
// getAttribute returns the value of a specified attribute on
// the element. If the given attribute does not exist, the value
// returned will be an empty string.
func getAttribute(node *html.Node, attrName string) string {
for i := 0; i < len(node.Attr); i++ {
if node.Attr[i].Key == attrName {
return node.Attr[i].Val
}
}
return ""
}
// setAttribute sets attribute for node. If attribute already exists,
// it will be replaced.
func setAttribute(node *html.Node, attrName string, attrValue string) {
attrIdx := -1
for i := 0; i < len(node.Attr); i++ {
if node.Attr[i].Key == attrName {
attrIdx = i
break
}
}
if attrIdx >= 0 {
node.Attr[attrIdx].Val = attrValue
} else {
node.Attr = append(node.Attr, html.Attribute{
Key: attrName,
Val: attrValue,
})
}
}
// removeAttribute removes attribute with given name.
func removeAttribute(node *html.Node, attrName string) {
attrIdx := -1
for i := 0; i < len(node.Attr); i++ {
if node.Attr[i].Key == attrName {
attrIdx = i
break
}
}
if attrIdx >= 0 {
a := node.Attr
a = append(a[:attrIdx], a[attrIdx+1:]...)
node.Attr = a
}
}
// hasAttribute returns a Boolean value indicating whether the
// specified node has the specified attribute or not.
func hasAttribute(node *html.Node, attrName string) bool {
for i := 0; i < len(node.Attr); i++ {
if node.Attr[i].Key == attrName {
return true
}
}
return false
}
// textContent returns the text content of the specified node,
// and all its descendants.
func textContent(node *html.Node) string {
var buffer bytes.Buffer
var finder func(*html.Node)
finder = func(n *html.Node) {
if n.Type == html.TextNode {
buffer.WriteString(n.Data)
}
for child := n.FirstChild; child != nil; child = child.NextSibling {
finder(child)
}
}
finder(node)
return buffer.String()
}
// outerHTML returns an HTML serialization of the element and its descendants.
func outerHTML(node *html.Node) []byte {
var buffer bytes.Buffer
err := html.Render(&buffer, node)
if err != nil {
return []byte{}
}
return buffer.Bytes()
}
// innerHTML returns the HTML content (inner HTML) of an element.
func innerHTML(node *html.Node) string {
var err error
var buffer bytes.Buffer
for child := node.FirstChild; child != nil; child = child.NextSibling {
err = html.Render(&buffer, child)
if err != nil {
return ""
}
}
return strings.TrimSpace(buffer.String())
}
// documentElement returns the Element that is the root element
// of the document. Since we are working with HTML documents,
// the root will be the <html> element.
func documentElement(doc *html.Node) *html.Node {
if nodes := getElementsByTagName(doc, "html"); len(nodes) > 0 {
return nodes[0]
}
return nil
}
// id returns the value of the id attribute of the specified element.
func id(node *html.Node) string {
id := getAttribute(node, "id")
id = strings.TrimSpace(id)
return id
}
// className returns the value of the class attribute of
// the specified element.
func className(node *html.Node) string {
className := getAttribute(node, "class")
className = strings.TrimSpace(className)
className = strings.Join(strings.Fields(className), " ")
return className
}
// children returns an HTMLCollection of the child elements of Node.
func children(node *html.Node) []*html.Node {
var children []*html.Node
if node == nil {
return nil
}
for child := node.FirstChild; child != nil; child = child.NextSibling {
if child.Type == html.ElementNode {
children = append(children, child)
}
}
return children
}
// childNodes returns list of a node's direct children.
func childNodes(node *html.Node) []*html.Node {
var childNodes []*html.Node
for child := node.FirstChild; child != nil; child = child.NextSibling {
childNodes = append(childNodes, child)
}
return childNodes
}
// firstElementChild returns the object's first child Element,
// or nil if there are no child elements.
func firstElementChild(node *html.Node) *html.Node {
for child := node.FirstChild; child != nil; child = child.NextSibling {
if child.Type == html.ElementNode {
return child
}
}
return nil
}
// nextElementSibling returns the Element immediately following
// the specified one in its parent's children list, or nil if the
// specified Element is the last one in the list.
func nextElementSibling(node *html.Node) *html.Node {
for sibling := node.NextSibling; sibling != nil; sibling = sibling.NextSibling {
if sibling.Type == html.ElementNode {
return sibling
}
}
return nil
}
// appendChild adds a node to the end of the list of children of a
// specified parent node. If the given child is a reference to an
// existing node in the document, appendChild() moves it from its
// current position to the new position.
func appendChild(node *html.Node, child *html.Node) {
if child.Parent != nil {
temp := cloneNode(child)
node.AppendChild(temp)
child.Parent.RemoveChild(child)
} else {
node.AppendChild(child)
}
}
// replaceNode replaces an OldNode with a NewNode.
func replaceNode(oldNode *html.Node, newNode *html.Node) {
if oldNode.Parent == nil {
return
}
newNode.Parent = nil
newNode.PrevSibling = nil
newNode.NextSibling = nil
oldNode.Parent.InsertBefore(newNode, oldNode)
oldNode.Parent.RemoveChild(oldNode)
}
// includeNode determines if node is included inside nodeList.
func includeNode(nodeList []*html.Node, node *html.Node) bool {
for i := 0; i < len(nodeList); i++ {
if nodeList[i] == node {
return true
}
}
return false
}
// cloneNode returns a deep clone of the node and its children.
// However, it will be detached from the original's parents
// and siblings.
func cloneNode(src *html.Node) *html.Node {
clone := &html.Node{
Type: src.Type,
DataAtom: src.DataAtom,
Data: src.Data,
Attr: make([]html.Attribute, len(src.Attr)),
}
copy(clone.Attr, src.Attr)
for child := src.FirstChild; child != nil; child = child.NextSibling {
clone.AppendChild(cloneNode(child))
}
return clone
}
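// getAllNodesWithTag returns all nodes inside the given node that match
// any of the specified tag names.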
func getAllNodesWithTag(node *html.Node, tagNames ...string) []*html.Node {
var result []*html.Node
for i := 0; i < len(tagNames); i++ {
result = append(result, getElementsByTagName(node, tagNames[i])...)
}
return result
}
// forEachNode iterates over a NodeList and runs fn on each node.
func forEachNode(nodeList []*html.Node, fn func(*html.Node, int)) {
for i := 0; i < len(nodeList); i++ {
fn(nodeList[i], i)
}
}
// removeNodes iterates over a NodeList, calls `filterFn` for each node
// and removes the node if the function returns `true`. If no function is
// passed, removes all the nodes in the node list.
func removeNodes(nodeList []*html.Node, filterFn func(*html.Node) bool) {
for i := len(nodeList) - 1; i >= 0; i-- {
node := nodeList[i]
parentNode := node.Parent
if parentNode != nil && (filterFn == nil || filterFn(node)) {
parentNode.RemoveChild(node)
}
}
}
// setTextContent sets the text content of the specified node.
func setTextContent(node *html.Node, text string) {
for child := node.FirstChild; child != nil; child = child.NextSibling {
if child.Parent != nil {
child.Parent.RemoveChild(child)
}
}
node.AppendChild(&html.Node{
Type: html.TextNode,
Data: text,
})
}

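A hypothetical test (not part of this commit) showing how a few of the DOM helpers above compose, roughly the kind of rewrite fixLazyImages performs for a data-src attribute:

package archiver

import (
	"strings"
	"testing"

	"golang.org/x/net/html"
)

func TestDOMHelpersSketch(t *testing.T) {
	doc, err := html.Parse(strings.NewReader(`<p><img data-src="a.png"></p>`))
	if err != nil {
		t.Fatal(err)
	}

	// Find the <img> and copy its lazy-load attribute into src.
	img := getAllNodesWithTag(doc, "img")[0]
	setAttribute(img, "src", getAttribute(img, "data-src"))

	if getAttribute(img, "src") != "a.png" {
		t.Fatalf("unexpected img: %s", outerHTML(img))
	}
}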
View File

@@ -0,0 +1,54 @@
package archiver
import (
nurl "net/url"
"regexp"
"strings"
)
var (
rxStyleURL = regexp.MustCompile(`(?i)^url\((.+)\)$`)
rxSingleQuote = regexp.MustCompile(`(?i)^'(.*)'$`)
rxDoubleQuote = regexp.MustCompile(`(?i)^"(.*)"$`)
rxJSContentType = regexp.MustCompile(`(?i)(text|application)/(java|ecma)script`)
)
func clearUTMParams(url *nurl.URL) {
queries := url.Query()
for key := range queries {
if strings.HasPrefix(key, "utm_") {
queries.Del(key)
}
}
url.RawQuery = queries.Encode()
}
// toAbsoluteURI converts uri to an absolute URI based on base.
// However, if uri is prefixed with hash (#), the uri won't be changed.
func toAbsoluteURI(uri string, base *nurl.URL) string {
if uri == "" || base == nil {
return ""
}
// If it is a hash fragment, return it as it is
if uri[:1] == "#" {
return uri
}
// If it is already an absolute URL, return as it is
tmp, err := nurl.ParseRequestURI(uri)
if err == nil && tmp.Scheme != "" && tmp.Hostname() != "" {
return uri
}
// Otherwise, resolve against base URI.
tmp, err = nurl.Parse(uri)
if err != nil {
return uri
}
clearUTMParams(tmp)
return base.ResolveReference(tmp).String()
}

76
pkg/warc/reader.go Normal file
View File

@@ -0,0 +1,76 @@
package warc
import (
"fmt"
"os"
"go.etcd.io/bbolt"
)
// Archive is the storage for archiving the web page.
type Archive struct {
db *bbolt.DB
}
// Open opens the archive at the specified path.
func Open(path string) (*Archive, error) {
// Make sure archive exists
info, err := os.Stat(path)
if os.IsNotExist(err) || info.IsDir() {
return nil, fmt.Errorf("archive doesn't exist")
}
// Open database
options := &bbolt.Options{
ReadOnly: true,
}
db, err := bbolt.Open(path, os.ModePerm, options)
if err != nil {
return nil, err
}
return &Archive{db: db}, nil
}
// Close closes the storage.
func (arc *Archive) Close() {
arc.db.Close()
}
// Read fetches the resource with the specified name from the archive.
func (arc *Archive) Read(name string) ([]byte, string, error) {
// Make sure name exists
if name == "" {
name = "archive-root"
}
var content []byte
var strContentType string
err := arc.db.View(func(tx *bbolt.Tx) error {
bucket := tx.Bucket([]byte(name))
if bucket == nil {
return fmt.Errorf("%s doesn't exist", name)
}
contentType := bucket.Get([]byte("type"))
if contentType == nil {
return fmt.Errorf("%s doesn't exist", name)
}
strContentType = string(contentType)
content = bucket.Get([]byte("content"))
if content == nil {
return fmt.Errorf("%s doesn't exist", name)
}
return nil
})
if err != nil {
return nil, "", err
}
return content, strContentType, nil
}

105
pkg/warc/writer.go Normal file
View File

@@ -0,0 +1,105 @@
package warc
import (
"fmt"
"io"
nurl "net/url"
"os"
fp "path/filepath"
"strings"
"time"
"github.com/go-shiori/shiori/pkg/warc/internal/archiver"
"go.etcd.io/bbolt"
)
// FromReader creates an archive from the specified io.Reader.
func FromReader(input io.Reader, url, contentType, dstPath string) error {
// Make sure URL is valid
parsedURL, err := nurl.ParseRequestURI(url)
if err != nil || parsedURL.Scheme == "" || parsedURL.Hostname() == "" {
return fmt.Errorf("url %s is not valid", url)
}
// Generate resource URL
res := archiver.ToResourceURL(url, parsedURL)
res.ArchivalURL = "archive-root"
// Create database for archive
os.MkdirAll(fp.Dir(dstPath), os.ModePerm)
db, err := bbolt.Open(dstPath, os.ModePerm, nil)
if err != nil {
return fmt.Errorf("failed to create archive: %v", err)
}
// Create archiver
arc := &archiver.Archiver{
DB: db,
ChDone: make(chan struct{}),
ChErrors: make(chan error),
ChWarnings: make(chan error),
ChRequest: make(chan archiver.ResourceURL, 10),
ResourceMap: make(map[string]struct{}),
LogEnabled: true,
}
defer arc.Close()
// Process input depending on its type.
// If it's HTML, we need to extract the sub resources it uses, e.g. some CSS or JS files.
// If it's not HTML, we can just save it to the archive.
var result archiver.ProcessResult
var subResources []archiver.ResourceURL
if strings.Contains(contentType, "text/html") {
result, subResources, err = arc.ProcessHTMLFile(res, input)
} else {
result, err = arc.ProcessOtherFile(res, input)
}
if err != nil {
return fmt.Errorf("archival failed: %v", err)
}
// Add this url to resource map to mark it as processed
arc.ResourceMap[res.DownloadURL] = struct{}{}
// Save content to storage
arc.Logf(0, "Downloaded %s", res.DownloadURL)
result.ContentType = contentType
err = arc.SaveToStorage(result)
if err != nil {
return fmt.Errorf("failed to save %s: %v", res.DownloadURL, err)
}
// If there are no sub resources found, our job is finished.
if len(subResources) == 0 {
return nil
}
// However, if there are, we need to run the archiver in the background to
// process the sub resources concurrently.
go func() {
for _, subRes := range subResources {
arc.ChRequest <- subRes
}
}()
time.Sleep(time.Second)
arc.StartArchiver()
return nil
}
// FromURL creates an archive from the specified URL.
func FromURL(url, dstPath string) error {
// Download URL
resp, err := archiver.DownloadData(url)
if err != nil {
return fmt.Errorf("failed to download %s: %v", url, err)
}
defer resp.Body.Close()
contentType := resp.Header.Get("Content-Type")
return FromReader(resp.Body, url, contentType, dstPath)
}
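Putting it together, a minimal sketch of how the public warc API introduced by this commit could be used end to end (the URL and archive path are placeholders):

package main

import (
	"fmt"
	"log"

	"github.com/go-shiori/shiori/pkg/warc"
)

func main() {
	// Download a page and all of its resources into a single bbolt file.
	if err := warc.FromURL("https://example.com", "/tmp/example-archive"); err != nil {
		log.Fatal(err)
	}

	// Re-open the archive later and read the root document back.
	archive, err := warc.Open("/tmp/example-archive")
	if err != nil {
		log.Fatal(err)
	}
	defer archive.Close()

	// An empty name falls back to "archive-root", i.e. the page itself.
	content, contentType, err := archive.Read("")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("%s, %d bytes\n", contentType, len(content))
}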