Scrapy: Catch IgnoreRequest and Continue With the Next URL

Python scrapy.exceptions.IgnoreRequest() Examples

The following are 30 code examples of scrapy.exceptions.IgnoreRequest(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module scrapy.exceptions, or try the search function.
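
A note before the examples, since the title asks how to catch IgnoreRequest and continue with the next URL: when a downloader middleware raises IgnoreRequest, Scrapy drops only that request and calls its errback (if one is set); the rest of the scheduled requests keep running. So "continuing" usually just means handling the failure in an errback. Below is a minimal sketch of that pattern; the spider name and URLs are made up for illustration.

import scrapy
from scrapy.exceptions import IgnoreRequest


class ExampleSpider(scrapy.Spider):  # hypothetical spider, not from the examples below
    name = "example"
    start_urls = ["https://example.com/a", "https://example.com/b"]

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, errback=self.on_error)

    def parse(self, response):
        yield {"url": response.url, "title": response.css("title::text").get()}

    def on_error(self, failure):
        # A request dropped by a middleware (IgnoreRequest) is not fatal:
        # log it and let Scrapy continue with the next scheduled request.
        if failure.check(IgnoreRequest):
            self.logger.info("Ignored %s", failure.request.url)
        else:
            self.logger.error(repr(failure))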

Example #1

def _redirect(self, redirected, request, spider, reason):
    ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
    redirects = request.meta.get('redirect_times', 0) + 1

    if ttl and redirects <= self.max_redirect_times:
        redirected.meta['redirect_times'] = redirects
        redirected.meta['redirect_ttl'] = ttl - 1
        redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
            [request.url]
        redirected.meta['redirect_reasons'] = request.meta.get('redirect_reasons', []) + \
            [reason]
        redirected.dont_filter = request.dont_filter
        redirected.priority = request.priority + self.priority_adjust
        logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
                     {'reason': reason, 'redirected': redirected, 'request': request},
                     extra={'spider': spider})
        return redirected
    else:
        logger.debug("Discarding %(request)s: max redirections reached",
                     {'request': request}, extra={'spider': spider})
        raise IgnoreRequest("max redirections reached")
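
This looks like Scrapy's built-in redirect middleware: once the redirect budget is spent, the chain is abandoned by raising IgnoreRequest("max redirections reached"). Assuming that is the stock middleware, the budget is tuned from settings.py; a sketch (values are illustrative, not recommendations):

# settings.py sketch
REDIRECT_MAX_TIMES = 30        # feeds max_redirect_times above (Scrapy default: 20)
REDIRECT_PRIORITY_ADJUST = 2   # feeds priority_adjust above (Scrapy default: +2)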

Example #2

def fetch(self, request_or_url, spider=None, redirect=True, **kwargs):
    if isinstance(request_or_url, Request):
        request = request_or_url
    else:
        url = any_to_uri(request_or_url)
        request = Request(url, dont_filter=True, **kwargs)
        if redirect:
            request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
        else:
            request.meta['handle_httpstatus_all'] = True
    response = None
    try:
        response, spider = threads.blockingCallFromThread(
            reactor, self._schedule, request, spider)
    except IgnoreRequest:
        pass
    self.populate_vars(response, request, spider)

Example #3

def _log_download_errors(self, spider_failure, download_failure, request, spider):
    """Log and silence errors that come from the engine (typically download
    errors that got propagated thru here)
    """
    if (isinstance(download_failure, Failure) and
            not download_failure.check(IgnoreRequest)):
        if download_failure.frames:
            logger.error('Error downloading %(request)s',
                         {'request': request},
                         exc_info=failure_to_exc_info(download_failure),
                         extra={'spider': spider})
        else:
            errmsg = download_failure.getErrorMessage()
            if errmsg:
                logger.error('Error downloading %(request)s: %(errmsg)s',
                             {'request': request, 'errmsg': errmsg},
                             extra={'spider': spider})

    if spider_failure is not download_failure:
        return spider_failure

Example #4

def _redirect(self, redirected, request, spider, reason):
    ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
    redirects = request.meta.get('redirect_times', 0) + 1

    if ttl and redirects <= self.max_redirect_times:
        redirected.meta['redirect_times'] = redirects
        redirected.meta['redirect_ttl'] = ttl - 1
        redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
            [request.url]
        redirected.meta['redirect_reasons'] = request.meta.get('redirect_reasons', []) + \
            [reason]
        redirected.dont_filter = request.dont_filter
        redirected.priority = request.priority + self.priority_adjust
        logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
                     {'reason': reason, 'redirected': redirected, 'request': request},
                     extra={'spider': spider})
        return redirected
    else:
        logger.debug("Discarding %(request)s: max redirections reached",
                     {'request': request}, extra={'spider': spider})
        raise IgnoreRequest("max redirections reached")

Example #5

def fetch(self, request_or_url, spider=None, redirect=True, **kwargs):
    if isinstance(request_or_url, Request):
        request = request_or_url
    else:
        url = any_to_uri(request_or_url)
        request = Request(url, dont_filter=True, **kwargs)
        if redirect:
            request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
        else:
            request.meta['handle_httpstatus_all'] = True
    response = None
    try:
        response, spider = threads.blockingCallFromThread(
            reactor, self._schedule, request, spider)
    except IgnoreRequest:
        pass
    self.populate_vars(response, request, spider)

Example #6

def process_exception(self, request, exception, spider):
    if isinstance(exception, (IgnoreRequest, DropItem)):
        return
    if not self._is_enabled_for_request(request):
        return

    autoextract = request.meta.pop(AUTOEXTRACT_META_KEY)
    stop_time = time.time()
    latency = time.time() - autoextract['timing']['start_ts']
    autoextract['timing'].update({'end_ts': stop_time, 'latency': latency})

    # Make sure to log all unknown failures
    logger.warning('AutoExtract failure after %.3fs for %s: %s',
                   latency,
                   autoextract['original_url'],
                   repr(exception),
                   extra={'spider': spider})

    request.meta['autoextract'] = autoextract
    ex_class = global_object_name(exception.__class__)
    self.inc_metric('autoextract/errors/total_count', spider=spider)
    self.inc_metric('autoextract/errors/type_count/%s' % ex_class, spider=spider)

Example #7

def process_request(self, request, spider):
    # don't use this middleware while testing whether the site is up
    if hasattr(spider, "test") and spider.test == "yes":
        # logger = logging.getLogger()
        # logger.info("Testing mode, dead domains disabled")
        return None

    if not Domain.is_onion_url(request.url):
        return None

    domain = Domain.find_by_url(request.url)
    if not domain or domain.is_up:
        return None

    raise IgnoreRequest('Domain %s is dead, skipping' % domain.host)
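
A filter middleware like this one only runs if it is registered. A wiring sketch, assuming a hypothetical module path (myproject.middlewares) and an arbitrary priority:

# settings.py sketch -- module path and priority are placeholders
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.FilterDeadDomainMiddleware': 543,
}

Requests dropped here never reach the downloader; the spider only notices if it attached an errback (see the sketch after the introduction).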

Example #8

def process_request(self, request, spider):
    parsed_url = urlparse.urlparse(request.url)

    if not self.test_mode or not parsed_url.path in ["/", ""]:
        return None

    if not Domain.is_onion_url(request.url):
        return None

    d = Domain.find_by_url(request.url)

    if d is None:
        return None

    now = datetime.now()

    if now > d.next_scheduled_check:
        return None
    else:
        raise IgnoreRequest('FilterNotScheduledMiddleware: %s is not scheduled to check' % d.host)

Example #9

def process_request(self, request, spider):
    if 'x-ignore-request' in request.url:
        raise IgnoreRequest()
    elif 'x-error-request' in request.url:
        _ = 1 / 0

Example #10

def process_response(self, request, response, spider):
    if 'x-ignore-response' in request.url:
        raise IgnoreRequest()
    elif 'x-error-response' in request.url:
        _ = 1 / 0
    else:
        return response

Example #11

def process_request(self, request, spider):
    if not request.url:
        return None
    channel_id = request.meta.get('channel_id', 0)
    # Only detail pages are checked (list pages are ignored); works together with the pipeline
    if is_dup_detail(request.url, spider.name, channel_id):
        raise IgnoreRequest("Spider: %s, DeDuplicationRequest: %s" % (spider.name, request.url))

Example #12

def process_request(self, request, spider):
    # Handle WeChat anti-crawling (anti-spider mechanism 1, sogou)
    if spider.name in ['weixin'] and 'antispider' in request.url:
        # Get the source (redirecting) URLs
        redirect_urls = request.meta['redirect_urls']

        # Clean up the expired cookies
        cookies_id = request.meta['cookiejar']
        del_cookies(spider.name, cookies_id)

        # spider.log(message='AntiSpider cookies_id: %s; url: %s' % (cookies_id, redirect_urls[0]))
        raise IgnoreRequest(
            'Spider: %s, AntiSpider cookies_id: %s; url: %s' % (spider.name, cookies_id, redirect_urls[0]))

Example #13

def mustbe_deferred(f, *args, **kw):
    """Same as twisted.internet.defer.maybeDeferred, but delay calling
    callback/errback to next reactor loop
    """
    try:
        result = f(*args, **kw)
    # FIXME: Hack to avoid introspecting tracebacks. This to speed up
    # processing of IgnoreRequest errors which are, by far, the most common
    # exception in Scrapy - see #125
    except IgnoreRequest as e:
        return defer_fail(failure.Failure(e))
    except Exception:
        return defer_fail(failure.Failure())
    else:
        return defer_result(result)

Example #14

def process_request_2(self, rp, request, spider):
    if rp is None:
        return
    if not rp.can_fetch(to_native_str(self._useragent), request.url):
        logger.debug("Forbidden by robots.txt: %(request)s",
                     {'request': request}, extra={'spider': spider})
        self.crawler.stats.inc_value('robotstxt/forbidden')
        raise IgnoreRequest("Forbidden by robots.txt")
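
This matches the shape of Scrapy's RobotsTxtMiddleware, which refuses disallowed URLs with IgnoreRequest("Forbidden by robots.txt"). Assuming that is what it is, the behaviour is switched on per project:

# settings.py sketch
ROBOTSTXT_OBEY = True   # disallowed URLs are dropped with IgnoreRequest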

Example #15

def _logerror(self, failure, request, spider):
    if failure.type is not IgnoreRequest:
        logger.error("Error downloading %(request)s: %(f_exception)s",
                     {'request': request, 'f_exception': failure.value},
                     exc_info=failure_to_exc_info(failure),
                     extra={'spider': spider})
    return failure

Example #16

def _robots_error(self, failure, netloc):
    if failure.type is not IgnoreRequest:
        key = 'robotstxt/exception_count/{}'.format(failure.type)
        self.crawler.stats.inc_value(key)
    rp_dfd = self._parsers[netloc]
    self._parsers[netloc] = None
    rp_dfd.callback(None)

Example #17

def media_failed(self, failure, request, info):
    if not isinstance(failure.value, IgnoreRequest):
        referer = referer_str(request)
        logger.warning(
            'File (unknown-error): Error downloading %(medianame)s from '
            '%(request)s referred in <%(referer)s>: %(exception)s',
            {'medianame': self.MEDIA_NAME, 'request': request,
             'referer': referer, 'exception': failure.value},
            extra={'spider': info.spider}
        )

    raise FileException

Example #18

def mustbe_deferred(f, *args, **kw):
    """Same as twisted.internet.defer.maybeDeferred, but delay calling
    callback/errback to next reactor loop
    """
    try:
        result = f(*args, **kw)
    # FIXME: Hack to avoid introspecting tracebacks. This to speed up
    # processing of IgnoreRequest errors which are, by far, the most common
    # exception in Scrapy - see #125
    except IgnoreRequest as e:
        return defer_fail(failure.Failure(e))
    except Exception:
        return defer_fail(failure.Failure())
    else:
        return defer_result(result)

Example #19

def _logerror(self, failure, request, spider):
    if failure.type is not IgnoreRequest:
        logger.error("Error downloading %(request)s: %(f_exception)s",
                     {'request': request, 'f_exception': failure.value},
                     exc_info=failure_to_exc_info(failure),
                     extra={'spider': spider})
    return failure

Example #20

def _robots_error(self, failure, netloc):
    if failure.type is not IgnoreRequest:
        key = 'robotstxt/exception_count/{}'.format(failure.type)
        self.crawler.stats.inc_value(key)
    rp_dfd = self._parsers[netloc]
    self._parsers[netloc] = None
    rp_dfd.callback(None)

Example #21

def process_request(self, request, spider):
    if request.meta.get('dont_cache', False):
        return

    # Skip uncacheable requests
    if not self.policy.should_cache_request(request):
        request.meta['_dont_cache'] = True  # flag as uncacheable
        return

    # Look for cached response and check if expired
    cachedresponse = self.storage.retrieve_response(spider, request)
    if cachedresponse is None:
        self.stats.inc_value('httpcache/miss', spider=spider)
        if self.ignore_missing:
            self.stats.inc_value('httpcache/ignore', spider=spider)
            raise IgnoreRequest("Ignored request not in cache: %s" % request)
        return  # first time request

    # Return cached response only if not expired
    cachedresponse.flags.append('cached')
    if self.policy.is_cached_response_fresh(cachedresponse, request):
        self.stats.inc_value('httpcache/hit', spider=spider)
        return cachedresponse

    # Keep a reference to cached response to avoid a second cache lookup on
    # process_response hook
    request.meta['cached_response'] = cachedresponse
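
This is the cache-lookup half of an HTTP cache middleware: with ignore_missing enabled, anything not already cached is dropped instead of downloaded, which is handy for reprocessing an old crawl offline. Assuming this is Scrapy's HttpCacheMiddleware, the relevant settings look roughly like:

# settings.py sketch -- replay a previous crawl purely from the cache
HTTPCACHE_ENABLED = True
HTTPCACHE_IGNORE_MISSING = True   # maps to self.ignore_missing above
HTTPCACHE_EXPIRATION_SECS = 0     # 0 means cached responses never expire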

Example #22

def media_failed(self, failure, request, info):
    if not isinstance(failure.value, IgnoreRequest):
        referer = referer_str(request)
        logger.warning(
            'File (unknown-error): Error downloading %(medianame)s from '
            '%(request)s referred in <%(referer)s>: %(exception)s',
            {'medianame': self.MEDIA_NAME, 'request': request,
             'referer': referer, 'exception': failure.value},
            extra={'spider': info.spider}
        )

    raise FileException

Example #23

def process_request(self, request, spider):
    if request.url not in spider.start_urls and (
            redis_conn.hexists(redis_url_key, request.url)
            or redis_conn.hexists(redis_invalid_url_key, request.url)):
        logger.info("Skip URL: %s, has been crawled" % request.url)
        raise IgnoreRequest("URL %s has been crawled" % request.url)
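
This example assumes a module-level Redis connection and two hashes holding crawled and invalid URLs. A sketch of the supporting pieces (connection details and key names are placeholders, not the original project's):

import redis

redis_conn = redis.StrictRedis(host='localhost', port=6379, db=0)
redis_url_key = 'crawled_urls'           # hash of successfully crawled URLs
redis_invalid_url_key = 'invalid_urls'   # hash of URLs that failed validation

# Elsewhere, a pipeline would mark a URL as done, e.g.:
#   redis_conn.hset(redis_url_key, item['url'], 1)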

Example #24

def process_request(self, request, spider):
    if not request.meta.get('crawl_once', self.default):
        return
    if self._get_key(request) in self.db:
        self.stats.inc_value('crawl_once/ignored')
        raise IgnoreRequest()
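
This appears to come from a crawl-once style middleware that remembers finished requests in a small database and drops repeats. A wiring sketch, assuming the scrapy-crawl-once package; the priorities below are illustrative, check the package's README:

# settings.py sketch
SPIDER_MIDDLEWARES = {
    'scrapy_crawl_once.CrawlOnceMiddleware': 100,
}
DOWNLOADER_MIDDLEWARES = {
    'scrapy_crawl_once.CrawlOnceMiddleware': 50,
}

# Requests opt in explicitly:
#   yield scrapy.Request(url, meta={'crawl_once': True})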

Example #25

def process_spider_exception(self, response, exception, spider):
    if (self.on_error_enabled and
            not isinstance(exception, IgnoreRequest) and
            self.counters['error'] < self.limits['error']):
        self.counters['error'] += 1
        self.save_response(response, spider)

Example #26

def test_process_spider_exception(self):
    assert self.instance.counters == {'all': 0, 'error': 0}
    self.instance.save_response = mock.Mock()
    # all conditions are true
    self.instance.on_error_enabled = True
    self.instance.process_spider_exception(
        'err-response', Exception(), self.spider)
    assert self.instance.counters == {'all': 0, 'error': 1}
    # on_error flag is disabled, skipping
    self.instance.on_error_enabled = False
    self.instance.process_spider_exception(
        'err-response', Exception(), self.spider)
    assert self.instance.counters == {'all': 0, 'error': 1}
    # exceeded error limit
    self.instance.on_error_enabled = True
    self.instance.counters['error'] = 11
    self.instance.process_spider_exception(
        'err-response', Exception(), self.spider)
    assert self.instance.counters == {'all': 0, 'error': 11}
    # skip IgnoreRequest
    self.instance.limits['error'] = 12
    self.instance.process_spider_exception(
        'err-response', IgnoreRequest(), self.spider)
    assert self.instance.counters == {'all': 0, 'error': 11}
    # all conditions are true again
    self.instance.limits['all'] = 12
    self.instance.process_spider_exception(
        'err-response', Exception(), self.spider)
    assert self.instance.counters == {'all': 0, 'error': 12}

Example #27

def process_request(self, request, spider):
    if not Domain.is_onion_url(request.url):
        return None

    parsed_url = urlparse.urlparse(request.url)
    host = parsed_url.hostname
    subdomains = host.count(".")
    if subdomains > 2:
        raise IgnoreRequest('Too many subdomains (%d > 2)' % subdomains)

    return None

Example #28

def process_request(self, request, spider):
    parsed_url = urlparse.urlparse(request.url)
    host = parsed_url.hostname
    if self.counter[host] < self.max_pages:
        self.counter[host] += 1
        spider.logger.info('Page count is %d for %s' % (self.counter[host], host))
        return None
    else:
        raise IgnoreRequest('MAX_PAGES_PER_DOMAIN reached, filtered %s' % request.url)
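
Example #28 only shows the process_request hook; the counter and the page cap have to come from somewhere. A self-contained sketch of one way the whole middleware could be built (class name, setting name, and default are assumptions, not the original project's):

from collections import defaultdict
from urllib.parse import urlparse

from scrapy.exceptions import IgnoreRequest


class MaxPagesPerDomainMiddleware:
    """Drop requests for a host once it has produced max_pages requests."""

    def __init__(self, max_pages):
        self.max_pages = max_pages
        self.counter = defaultdict(int)  # host -> pages requested so far

    @classmethod
    def from_crawler(cls, crawler):
        # Hypothetical setting name; 1000 is an arbitrary default.
        return cls(crawler.settings.getint('MAX_PAGES_PER_DOMAIN', 1000))

    def process_request(self, request, spider):
        host = urlparse(request.url).hostname
        if self.counter[host] >= self.max_pages:
            raise IgnoreRequest('MAX_PAGES_PER_DOMAIN reached, filtered %s' % request.url)
        self.counter[host] += 1
        return None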

Example #29

def test_middleware():
    Rq = lambda path: Request(
        'http://example.com{}'.format(path),
        meta={'avoid_dup_content': True})
    Rs = lambda req, body: HtmlResponse(
        req.url, body=body.encode(), request=req)
    mw = AvoidDupContentMiddleware(
        initial_queue_limit=1, threshold=0.5, exploration=0.00)
    spider = Spider()
    req = Rq('/')
    mw.process_request(req, spider)
    mw.process_response(req, Rs(req, ''), spider)
    assert mw.dupe_predictor
    n_dropped = 0
    for i in range(10):
        req = Rq('/viewtopic.php?topic_id={}'.format(i))
        mw.process_request(req, spider)
        mw.process_response(req, Rs(req, 'Topic {}'.format(i)), spider)
        req = Rq('/viewtopic.php?topic_id={}&start=0'.format(i))
        try:
            mw.process_request(req, spider)
        except IgnoreRequest:
            n_dropped += 1
        else:
            mw.process_response(req, Rs(req, 'Topic {}'.format(i)), spider)
        mw.dupe_predictor.log_dupstats(min_dup=0)
    assert n_dropped == 5
    # one request in different order
    req = Rq('/viewtopic.php?topic_id=100&start=0')
    mw.process_request(req, spider)
    mw.process_response(req, Rs(req, ''), spider)
    mw.process_request(Rq('/viewtopic.php?topic_id=200'), spider)
    with pytest.raises(IgnoreRequest):
        mw.process_request(Rq('/viewtopic.php?topic_id=100'), spider)
    # test exploration
    mw.exploration = 0.5
    n_dropped = 0
    n_requests = 0
    for i in range(150, 170):
        req = Rq('/viewtopic.php?topic_id={}'.format(i))
        mw.process_request(req, spider)
        mw.process_response(req, Rs(req, 'Topic {}'.format(i)), spider)
        req = Rq('/viewtopic.php?topic_id={}&start=0'.format(i))
        n_requests += 1
        try:
            mw.process_request(req, spider)
        except IgnoreRequest:
            n_dropped += 1
        else:
            mw.process_response(req, Rs(req, 'Topic {}'.format(i)), spider)
    assert n_dropped > 0
    assert n_dropped < n_requests

Example #30

def test_crawl(tmpdir):
    settings = {'CRAWL_ONCE_PATH': str(tmpdir)}
    crawler = get_crawler(settings_dict=settings)
    req1 = scrapy.Request('http://example.com/1', meta={'crawl_once': True})
    req2 = scrapy.Request('http://example.com/2')
    req3 = scrapy.Request('http://example.com/3', meta={'crawl_once': True})

    resp1 = Response(req1.url, request=req1)
    resp2 = Response(req2.url, request=req2)

    with opened_middleware(crawler) as mw:

        # 1. check spider middleware interface
        assert len(mw.db) == 0
        assert crawler.stats.get_value('crawl_once/initial') == 0
        output = [{}, scrapy.Request('http://example.com')]

        # crawl_once is False
        res = list(mw.process_spider_output(resp2, output, crawler.spider))
        assert res == output
        assert len(mw.db) == 0

        # crawl_once is True
        res = list(mw.process_spider_output(resp1, output, crawler.spider))
        assert res == output
        assert len(mw.db) == 1
        assert crawler.stats.get_value('crawl_once/initial') == 0
        assert crawler.stats.get_value('crawl_once/stored') == 1

        # 2. check downloader middleware interface
        assert mw.process_request(req2, crawler.spider) is None
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 0

        with pytest.raises(IgnoreRequest):
            mw.process_request(req1, crawler.spider)
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 1

        assert mw.process_request(req3, crawler.spider) is None
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 1
        assert crawler.stats.get_value('crawl_once/initial') == 0

    crawler = get_crawler(settings_dict=settings)
    with opened_middleware(crawler) as mw2:
        # it reuses the same file, so there are records
        assert len(mw2.db) == 1
        assert crawler.stats.get_value('crawl_once/initial') == 1
        assert mw2.process_request(req2, crawler.spider) is None
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 0
        with pytest.raises(IgnoreRequest):
            mw2.process_request(req1, crawler.spider)
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 1
        assert mw2.process_request(req3, crawler.spider) is None


Source: https://www.programcreek.com/python/example/84839/scrapy.exceptions.IgnoreRequest
