Scrapy: Catch IgnoreRequest and Continue With the Next URL
Python scrapy.exceptions.IgnoreRequest() Examples
The following are 24 code examples of scrapy.exceptions.IgnoreRequest, taken from Scrapy itself and from Scrapy-based projects. They illustrate where IgnoreRequest is typically raised (downloader middlewares, the robots.txt and HTTP cache middlewares, media pipelines) and where it is caught and silenced so that the crawl continues with the remaining requests. For the other exceptions Scrapy defines, see the scrapy.exceptions module.
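Before diving into the examples, here is a minimal sketch of the pattern the title refers to: a downloader middleware raises IgnoreRequest for requests it wants to drop, and the spider attaches an errback that recognizes the exception, so the crawl simply continues with the next URL. Request.errback and failure.check() are standard Scrapy/Twisted API; the middleware name and the skip condition are invented for this sketch.

import scrapy
from scrapy.exceptions import IgnoreRequest


class SkipBlockedMiddleware:
    """Hypothetical downloader middleware: drop any URL containing 'blocked'."""

    def process_request(self, request, spider):
        if 'blocked' in request.url:  # made-up skip condition
            raise IgnoreRequest('skipping %s' % request.url)
        return None  # let the request proceed normally


class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = [
        'http://example.com/a',
        'http://example.com/blocked',
        'http://example.com/b',
    ]

    def start_requests(self):
        for url in self.start_urls:
            # When a downloader middleware raises IgnoreRequest, Scrapy calls
            # the request's errback instead of its callback.
            yield scrapy.Request(url, callback=self.parse, errback=self.on_error)

    def parse(self, response):
        yield {'url': response.url}

    def on_error(self, failure):
        if failure.check(IgnoreRequest):
            # failure.value is the IgnoreRequest raised by the middleware
            self.logger.info('Ignored request (%s), moving on to the next URL',
                             failure.value)
        # Returning nothing here lets the remaining requests run as usual.

In a real project the middleware still has to be enabled through DOWNLOADER_MIDDLEWARES in settings.py; the examples below show how Scrapy and several third-party projects use the same exception.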
Example #1
def _redirect(self, redirected, request, spider, reason):
    ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
    redirects = request.meta.get('redirect_times', 0) + 1

    if ttl and redirects <= self.max_redirect_times:
        redirected.meta['redirect_times'] = redirects
        redirected.meta['redirect_ttl'] = ttl - 1
        redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
            [request.url]
        redirected.meta['redirect_reasons'] = request.meta.get('redirect_reasons', []) + \
            [reason]
        redirected.dont_filter = request.dont_filter
        redirected.priority = request.priority + self.priority_adjust
        logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
                     {'reason': reason, 'redirected': redirected, 'request': request},
                     extra={'spider': spider})
        return redirected
    else:
        logger.debug("Discarding %(request)s: max redirections reached",
                     {'request': request}, extra={'spider': spider})
        raise IgnoreRequest("max redirections reached")
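As a side note, the "max redirections reached" IgnoreRequest raised above is governed by a built-in setting; a settings.py fragment such as the following (the value is chosen only for illustration) tightens the limit:

# settings.py (sketch): lower the redirect budget so RedirectMiddleware
# raises IgnoreRequest("max redirections reached") sooner.
# Scrapy's default for REDIRECT_MAX_TIMES is 20.
REDIRECT_MAX_TIMES = 5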
Example #2
def fetch(self, request_or_url, spider=None, redirect=True, **kwargs):
    if isinstance(request_or_url, Request):
        request = request_or_url
    else:
        url = any_to_uri(request_or_url)
        request = Request(url, dont_filter=True, **kwargs)
        if redirect:
            request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
        else:
            request.meta['handle_httpstatus_all'] = True
    response = None
    try:
        response, spider = threads.blockingCallFromThread(
            reactor, self._schedule, request, spider)
    except IgnoreRequest:
        pass
    self.populate_vars(response, request, spider)
Example #3
def _log_download_errors(self, spider_failure, download_failure, request, spider):
    """Log and silence errors that come from the engine (typically download
    errors that got propagated thru here)
    """
    if (isinstance(download_failure, Failure) and
            not download_failure.check(IgnoreRequest)):
        if download_failure.frames:
            logger.error('Error downloading %(request)s',
                         {'request': request},
                         exc_info=failure_to_exc_info(download_failure),
                         extra={'spider': spider})
        else:
            errmsg = download_failure.getErrorMessage()
            if errmsg:
                logger.error('Error downloading %(request)s: %(errmsg)s',
                             {'request': request, 'errmsg': errmsg},
                             extra={'spider': spider})

    if spider_failure is not download_failure:
        return spider_failure
Example #4
def process_exception(self, request, exception, spider):
    if isinstance(exception, (IgnoreRequest, DropItem)):
        return
    if not self._is_enabled_for_request(request):
        return

    autoextract = request.meta.pop(AUTOEXTRACT_META_KEY)
    stop_time = time.time()
    latency = time.time() - autoextract['timing']['start_ts']
    autoextract['timing'].update({'end_ts': stop_time, 'latency': latency})

    # Make sure to log all unknown failures
    logger.warning('AutoExtract failure after %.3fs for %s: %s',
                   latency,
                   autoextract['original_url'],
                   repr(exception),
                   extra={'spider': spider})

    request.meta['autoextract'] = autoextract
    ex_class = global_object_name(exception.__class__)
    self.inc_metric('autoextract/errors/total_count', spider=spider)
    self.inc_metric('autoextract/errors/type_count/%s' % ex_class, spider=spider)
Example #5
def process_request(self, request, spider):
    # don't use this middleware while testing whether the site is up
    if hasattr(spider, "test") and spider.test == "yes":
        # logger = logging.getLogger()
        # logger.info("Testing mode, dead domains disabled")
        return None

    if not Domain.is_onion_url(request.url):
        return None

    domain = Domain.find_by_url(request.url)
    if not domain or domain.is_up:
        return None

    raise IgnoreRequest('Domain %s is dead, skipping' % domain.host)
Example #6
def process_request(self, request, spider):
    parsed_url = urlparse.urlparse(request.url)
    if not self.test_mode or parsed_url.path not in ["/", ""]:
        return None
    if not Domain.is_onion_url(request.url):
        return None

    d = Domain.find_by_url(request.url)
    if d is None:
        return None

    now = datetime.now()
    if now > d.next_scheduled_check:
        return None
    else:
        raise IgnoreRequest('FilterNotScheduledMiddleware: %s is not scheduled to check' % d.host)
Example #7
def process_request(self, request, spider):
    if 'x-ignore-request' in request.url:
        raise IgnoreRequest()
    elif 'x-error-request' in request.url:
        _ = 1 / 0
Example #8
def process_response(self, request, response, spider):
    if 'x-ignore-response' in request.url:
        raise IgnoreRequest()
    elif 'x-error-response' in request.url:
        _ = 1 / 0
    else:
        return response
Example #9
def process_request(self, request, spider):
    if not request.url:
        return None

    channel_id = request.meta.get('channel_id', 0)
    # Deduplicate detail pages (listing pages are ignored), in cooperation with the pipeline
    if is_dup_detail(request.url, spider.name, channel_id):
        raise IgnoreRequest("Spider: %s, DeDuplicationRequest: %s" % (spider.name, request.url))
Example #10
def process_request(self, request, spider):
    # Handle WeChat anti-crawling (anti-spider mechanism 1, Sogou)
    if spider.name in ['weixin'] and 'antispider' in request.url:
        # Get the redirect source URLs
        redirect_urls = request.meta['redirect_urls']
        # Clear the invalidated cookies
        cookies_id = request.meta['cookiejar']
        del_cookies(spider.name, cookies_id)
        # spider.log(message='AntiSpider cookies_id: %s; url: %s' % (cookies_id, redirect_urls[0]))
        raise IgnoreRequest(
            'Spider: %s, AntiSpider cookies_id: %s; url: %s' % (spider.name, cookies_id, redirect_urls[0]))
Example #11
def mustbe_deferred(f, *args, **kw):
    """Same as twisted.internet.defer.maybeDeferred, but delay calling
    callback/errback to next reactor loop
    """
    try:
        result = f(*args, **kw)
    # FIXME: Hack to avoid introspecting tracebacks. This to speed up
    # processing of IgnoreRequest errors which are, by far, the most common
    # exception in Scrapy - see #125
    except IgnoreRequest as e:
        return defer_fail(failure.Failure(e))
    except Exception:
        return defer_fail(failure.Failure())
    else:
        return defer_result(result)
Example #12
def process_request_2(self, rp, request, spider):
    if rp is None:
        return
    if not rp.can_fetch(to_native_str(self._useragent), request.url):
        logger.debug("Forbidden by robots.txt: %(request)s",
                     {'request': request}, extra={'spider': spider})
        self.crawler.stats.inc_value('robotstxt/forbidden')
        raise IgnoreRequest("Forbidden by robots.txt")
Example #13
def _logerror(self, failure, request, spider):
    if failure.type is not IgnoreRequest:
        logger.error("Error downloading %(request)s: %(f_exception)s",
                     {'request': request, 'f_exception': failure.value},
                     exc_info=failure_to_exc_info(failure),
                     extra={'spider': spider})
    return failure
Example #14
def _robots_error(self, failure, netloc):
    if failure.type is not IgnoreRequest:
        key = 'robotstxt/exception_count/{}'.format(failure.type)
        self.crawler.stats.inc_value(key)
    rp_dfd = self._parsers[netloc]
    self._parsers[netloc] = None
    rp_dfd.callback(None)
Example #15
def media_failed(self, failure, request, info):
    if not isinstance(failure.value, IgnoreRequest):
        referer = referer_str(request)
        logger.warning(
            'File (unknown-error): Error downloading %(medianame)s from '
            '%(request)s referred in <%(referer)s>: %(exception)s',
            {'medianame': self.MEDIA_NAME, 'request': request,
             'referer': referer, 'exception': failure.value},
            extra={'spider': info.spider}
        )

    raise FileException
Example #16
def process_request(self, request, spider):
    if request.meta.get('dont_cache', False):
        return

    # Skip uncacheable requests
    if not self.policy.should_cache_request(request):
        request.meta['_dont_cache'] = True  # flag as uncacheable
        return

    # Look for cached response and check if expired
    cachedresponse = self.storage.retrieve_response(spider, request)
    if cachedresponse is None:
        self.stats.inc_value('httpcache/miss', spider=spider)
        if self.ignore_missing:
            self.stats.inc_value('httpcache/ignore', spider=spider)
            raise IgnoreRequest("Ignored request not in cache: %s" % request)
        return  # first time request

    # Return cached response only if not expired
    cachedresponse.flags.append('cached')
    if self.policy.is_cached_response_fresh(cachedresponse, request):
        self.stats.inc_value('httpcache/hit', spider=spider)
        return cachedresponse

    # Keep a reference to cached response to avoid a second cache lookup on
    # process_response hook
    request.meta['cached_response'] = cachedresponse
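The self.ignore_missing branch above maps to a built-in HTTP cache setting; a settings.py fragment such as the following (a sketch, not taken from the example's project) turns it on:

# settings.py (sketch): when HTTPCACHE_IGNORE_MISSING is enabled, requests
# without a cached response are dropped via IgnoreRequest instead of being
# downloaded.
HTTPCACHE_ENABLED = True
HTTPCACHE_IGNORE_MISSING = True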
Example #17
def process_request(self, request, spider):
    if request.url not in spider.start_urls and (redis_conn.hexists(redis_url_key, request.url) or
                                                 redis_conn.hexists(redis_invalid_url_key, request.url)):
        logger.info("Skip URL: %s, has been crawled" % request.url)
        raise IgnoreRequest("URL %s has been crawled" % request.url)
Example #18
def process_request(self, request, spider):
    if not request.meta.get('crawl_once', self.default):
        return
    if self._get_key(request) in self.db:
        self.stats.inc_value('crawl_once/ignored')
        raise IgnoreRequest()
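Requests opt into this behaviour through their meta dict (see also the test in Example #24 below); a small helper, assuming nothing beyond what the snippet above shows, could look like:

import scrapy

def make_crawl_once_request(url):
    # The middleware above raises IgnoreRequest when the request's key is
    # already stored in its database and crawl_once is enabled for the request.
    return scrapy.Request(url, meta={'crawl_once': True})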
Example #19
def process_spider_exception(self, response, exception, spider):
    if (self.on_error_enabled and
            not isinstance(exception, IgnoreRequest) and
            self.counters['error'] < self.limits['error']):
        self.counters['error'] += 1
        self.save_response(response, spider)
Example #20
def test_process_spider_exception(self):
    assert self.instance.counters == {'all': 0, 'error': 0}
    self.instance.save_response = mock.Mock()
    # all conditions are true
    self.instance.on_error_enabled = True
    self.instance.process_spider_exception(
        'err-response', Exception(), self.spider)
    assert self.instance.counters == {'all': 0, 'error': 1}
    # on_error flag is disabled, skipping
    self.instance.on_error_enabled = False
    self.instance.process_spider_exception(
        'err-response', Exception(), self.spider)
    assert self.instance.counters == {'all': 0, 'error': 1}
    # exceeded error limit
    self.instance.on_error_enabled = True
    self.instance.counters['error'] = 11
    self.instance.process_spider_exception(
        'err-response', Exception(), self.spider)
    assert self.instance.counters == {'all': 0, 'error': 11}
    # skip IgnoreRequest
    self.instance.limits['error'] = 12
    self.instance.process_spider_exception(
        'err-response', IgnoreRequest(), self.spider)
    assert self.instance.counters == {'all': 0, 'error': 11}
    # all conditions are true again
    self.instance.limits['all'] = 12
    self.instance.process_spider_exception(
        'err-response', Exception(), self.spider)
    assert self.instance.counters == {'all': 0, 'error': 12}
Example #21
def process_request(self, request, spider):
    if not Domain.is_onion_url(request.url):
        return None

    parsed_url = urlparse.urlparse(request.url)
    host = parsed_url.hostname
    subdomains = host.count(".")
    if subdomains > 2:
        raise IgnoreRequest('Too many subdomains (%d > 2)' % subdomains)

    return None
Example #22
def process_request(self, request, spider):
    parsed_url = urlparse.urlparse(request.url)
    host = parsed_url.hostname
    if self.counter[host] < self.max_pages:
        self.counter[host] += 1
        spider.logger.info('Page count is %d for %s' % (self.counter[host], host))
        return None
    else:
        raise IgnoreRequest('MAX_PAGES_PER_DOMAIN reached, filtered %s' % request.url)
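A custom middleware like this only takes effect once it is registered; the fragment below is a sketch with a hypothetical module path and a hypothetical MAX_PAGES_PER_DOMAIN setting matching the message in the exception above:

# settings.py (sketch, hypothetical names): register the per-domain page
# limiter and give it a page budget to enforce.
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.MaxPagesPerDomainMiddleware': 543,
}
MAX_PAGES_PER_DOMAIN = 1000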
Example #23
def test_middleware():
    Rq = lambda path: Request(
        'http://example.com{}'.format(path), meta={'avoid_dup_content': True})
    Rs = lambda req, body: HtmlResponse(
        req.url, body=body.encode(), request=req)
    mw = AvoidDupContentMiddleware(
        initial_queue_limit=1, threshold=0.5, exploration=0.00)
    spider = Spider()
    req = Rq('/')
    mw.process_request(req, spider)
    mw.process_response(req, Rs(req, ''), spider)
    assert mw.dupe_predictor
    n_dropped = 0
    for i in range(10):
        req = Rq('/viewtopic.php?topic_id={}'.format(i))
        mw.process_request(req, spider)
        mw.process_response(req, Rs(req, 'Topic {}'.format(i)), spider)
        req = Rq('/viewtopic.php?topic_id={}&start=0'.format(i))
        try:
            mw.process_request(req, spider)
        except IgnoreRequest:
            n_dropped += 1
        else:
            mw.process_response(req, Rs(req, 'Topic {}'.format(i)), spider)
    mw.dupe_predictor.log_dupstats(min_dup=0)
    assert n_dropped == 5
    # one request in different order
    req = Rq('/viewtopic.php?topic_id=100&start=0')
    mw.process_request(req, spider)
    mw.process_response(req, Rs(req, ''), spider)
    mw.process_request(Rq('/viewtopic.php?topic_id=200'), spider)
    with pytest.raises(IgnoreRequest):
        mw.process_request(Rq('/viewtopic.php?topic_id=100'), spider)
    # test exploration
    mw.exploration = 0.5
    n_dropped = 0
    n_requests = 0
    for i in range(150, 170):
        req = Rq('/viewtopic.php?topic_id={}'.format(i))
        mw.process_request(req, spider)
        mw.process_response(req, Rs(req, 'Topic {}'.format(i)), spider)
        req = Rq('/viewtopic.php?topic_id={}&start=0'.format(i))
        n_requests += 1
        try:
            mw.process_request(req, spider)
        except IgnoreRequest:
            n_dropped += 1
        else:
            mw.process_response(req, Rs(req, 'Topic {}'.format(i)), spider)
    assert n_dropped > 0
    assert n_dropped < n_requests
Example #24
def test_crawl(tmpdir):
    settings = {'CRAWL_ONCE_PATH': str(tmpdir)}
    crawler = get_crawler(settings_dict=settings)
    req1 = scrapy.Request('http://example.com/1', meta={'crawl_once': True})
    req2 = scrapy.Request('http://example.com/2')
    req3 = scrapy.Request('http://example.com/3', meta={'crawl_once': True})
    resp1 = Response(req1.url, request=req1)
    resp2 = Response(req2.url, request=req2)

    with opened_middleware(crawler) as mw:
        # 1. check spider middleware interface
        assert len(mw.db) == 0
        assert crawler.stats.get_value('crawl_once/initial') == 0
        output = [{}, scrapy.Request('http://example.com')]

        # crawl_once is False
        res = list(mw.process_spider_output(resp2, output, crawler.spider))
        assert res == output
        assert len(mw.db) == 0

        # crawl_once is True
        res = list(mw.process_spider_output(resp1, output, crawler.spider))
        assert res == output
        assert len(mw.db) == 1
        assert crawler.stats.get_value('crawl_once/initial') == 0
        assert crawler.stats.get_value('crawl_once/stored') == 1

        # 2. check downloader middleware interface
        assert mw.process_request(req2, crawler.spider) is None
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 0
        with pytest.raises(IgnoreRequest):
            mw.process_request(req1, crawler.spider)
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 1
        assert mw.process_request(req3, crawler.spider) is None
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 1
        assert crawler.stats.get_value('crawl_once/initial') == 0

    crawler = get_crawler(settings_dict=settings)
    with opened_middleware(crawler) as mw2:
        # it reuses the same file, so there are records
        assert len(mw2.db) == 1
        assert crawler.stats.get_value('crawl_once/initial') == 1
        assert mw2.process_request(req2, crawler.spider) is None
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 0
        with pytest.raises(IgnoreRequest):
            mw2.process_request(req1, crawler.spider)
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 1
        assert mw2.process_request(req3, crawler.spider) is None
Source: https://www.programcreek.com/python/example/84839/scrapy.exceptions.IgnoreRequest