shelfzilla/crawler/crawler/settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for crawler project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'crawler'

SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'

ITEM_PIPELINES = {
    # Check duplicate
    # Determine if other language than Spanish (drop them if is)
    # Clean fields
    'crawler.pipelines.DuplicatesPipeline': 100,
    'crawler.pipelines.CheckLanguagePipeline': 200,
    'crawler.pipelines.CleanFieldsPipeline': 300,
    # ...
    'scrapycouchdb.CouchDBPipeline': 1000,
}


COUCHDB_SERVER = 'http://127.0.0.1:5984/'
COUCHDB_DB = 'norma'
COUCHDB_UNIQ_KEY = 'uuid'
COUCHDB_IGNORE_FIELDS = []

LOG_LEVEL = 'ERROR'
LOG_FILE = 'scrapy.log'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'Test Crawler (+http://www.yourdomain.com)'