Done

SimeonYS · Mar 25, 2021 · d18f26d · d18f26d
commit d18f26d
Show file tree

Hide file tree

Showing 20 changed files with 257 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+.idea/
+gh.exe
+upload.bat
diff --git a/README.md b/README.md
@@ -0,0 +1,9 @@
+URL: https://banksouthern.com/news/
+
+Spider name: bancorp
+DB Schema:
+
+date
+title
+link
+content
diff --git a/bancorp.db b/bancorp.db
diff --git a/bancorp/__init__.py b/bancorp/__init__.py
diff --git a/bancorp/__pycache__/__init__.cpython-38.pyc b/bancorp/__pycache__/__init__.cpython-38.pyc
diff --git a/bancorp/__pycache__/items.cpython-38.pyc b/bancorp/__pycache__/items.cpython-38.pyc
diff --git a/bancorp/__pycache__/pipelines.cpython-38.pyc b/bancorp/__pycache__/pipelines.cpython-38.pyc
diff --git a/bancorp/__pycache__/settings.cpython-38.pyc b/bancorp/__pycache__/settings.cpython-38.pyc
diff --git a/bancorp/items.py b/bancorp/items.py
@@ -0,0 +1,8 @@
+import scrapy
+
+
+class BancorpItem(scrapy.Item):
+    title = scrapy.Field()
+    content = scrapy.Field()
+    date = scrapy.Field()
+    link = scrapy.Field()
diff --git a/bancorp/middlewares.py b/bancorp/middlewares.py
@@ -0,0 +1,103 @@
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+# useful for handling different item types with a single interface
+from itemadapter import is_item, ItemAdapter
+
+
+class CreditosportivoSpiderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, or item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Request or item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn’t have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class CreditosportivoDownloaderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either;
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
diff --git a/bancorp/pipelines.py b/bancorp/pipelines.py
@@ -0,0 +1,32 @@
+import sqlite3
+
+
+class BancorpPipeline:
+
+    # Database setup
+    conn = sqlite3.connect('bancorp.db')
+    c = conn.cursor()
+
+    def open_spider(self, spider):
+        self.c.execute("""CREATE TABLE IF NOT EXISTS `bancorp`
+                         (date text, title text, link text, content text)""")
+
+    def process_item(self, item, spider):
+        self.c.execute("""SELECT * FROM bancorp WHERE title = ? AND date = ?""",
+                       (item.get('title'), item.get('date')))
+        duplicate = self.c.fetchall()
+        if len(duplicate):
+            return item
+        print(f"New entry added at {item['link']}")
+
+        # Insert values
+        self.c.execute("INSERT INTO bancorp (date, title, link, content)"
+                       "VALUES (?,?,?,?)", (item.get('date'), item.get('title'), item.get('link'), item.get('content')))
+        self.conn.commit()  # commit after every entry
+
+        return item
+
+    def close_spider(self, spider):
+        self.conn.commit()
+        self.conn.close()
+
diff --git a/bancorp/settings.py b/bancorp/settings.py
@@ -0,0 +1,15 @@
+BOT_NAME = 'bancorp'
+
+SPIDER_MODULES = ['bancorp.spiders']
+NEWSPIDER_MODULE = 'bancorp.spiders'
+FEED_EXPORT_ENCODING = 'utf-8'
+LOG_LEVEL = 'ERROR'
+DOWNLOAD_DELAY = 0
+USER_AGENT="Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36"
+
+ROBOTSTXT_OBEY = True
+
+ITEM_PIPELINES = {
+	'bancorp.pipelines.BancorpPipeline': 300,
+
+}
diff --git a/bancorp/spiders/__init__.py b/bancorp/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/bancorp/spiders/__pycache__/__init__.cpython-38.pyc b/bancorp/spiders/__pycache__/__init__.cpython-38.pyc
diff --git a/bancorp/spiders/__pycache__/blog.cpython-38.pyc b/bancorp/spiders/__pycache__/blog.cpython-38.pyc
diff --git a/bancorp/spiders/__pycache__/spider.cpython-38.pyc b/bancorp/spiders/__pycache__/spider.cpython-38.pyc
diff --git a/bancorp/spiders/blog.py b/bancorp/spiders/blog.py
@@ -0,0 +1,39 @@
+import re
+import scrapy
+from scrapy.loader import ItemLoader
+from ..items import BancorpItem
+from itemloaders.processors import TakeFirst
+
+# Matches an optional non-breaking space (U+00A0).  parse_post() passes this
+# to re.sub() with an empty replacement to delete those characters from the
+# joined post text.
+pattern = r'(\xa0)?'
+
+
+class BlogSpider(scrapy.Spider):
+    name = 'blog'
+    start_urls = ['https://banksouthern.com/blog/']
+    ITEM_PIPELINES = {
+        'blog.pipelines.BancorpPipeline': 300,
+    }
+    def parse(self, response):
+        post_links = response.xpath('//h2/a/@href').getall()
+        yield from response.follow_all(post_links, self.parse_post)
+
+        next_page = response.xpath('//div[@class="alignleft"]/a/@href').get()
+        if next_page:
+            yield response.follow(next_page, self.parse)
+
+    def parse_post(self, response):
+        date = response.xpath('//span[@class="published"]/text()').get()
+        title = response.xpath('//h1/text()').get()
+        content = response.xpath('//div[@class="entry-content"]//text()').getall()
+        content = [p.strip() for p in content if p.strip()]
+        content = re.sub(pattern, "", ' '.join(content))
+
+        item = ItemLoader(item=BancorpItem(), response=response)
+        item.default_output_processor = TakeFirst()
+
+        item.add_value('title', title)
+        item.add_value('link', response.url)
+        item.add_value('content', content)
+        item.add_value('date', date)
+
+        yield item.load_item()
diff --git a/bancorp/spiders/spider.py b/bancorp/spiders/spider.py
@@ -0,0 +1,37 @@
+import re
+import scrapy
+from scrapy.loader import ItemLoader
+from ..items import BancorpItem
+from itemloaders.processors import TakeFirst
+
+# Matches an optional non-breaking space (U+00A0).  parse_post() passes this
+# to re.sub() with an empty replacement to delete those characters from the
+# joined post text.
+pattern = r'(\xa0)?'
+
+class BancorpSpider(scrapy.Spider):
+	name = 'bancorp'
+	start_urls = ['https://banksouthern.com/news/']
+
+	def parse(self, response):
+		post_links = response.xpath('//h2/a/@href').getall()
+		yield from response.follow_all(post_links, self.parse_post)
+
+		next_page = response.xpath('//div[@class="alignleft"]/a/@href').get()
+		if next_page:
+			yield response.follow(next_page, self.parse)
+
+
+	def parse_post(self, response):
+		date = response.xpath('//span[@class="published"]/text()').get()
+		title = response.xpath('//h1/text()').get()
+		content = response.xpath('//div[@class="entry-content"]//text()').getall()
+		content = [p.strip() for p in content if p.strip()]
+		content = re.sub(pattern, "",' '.join(content))
+
+		item = ItemLoader(item=BancorpItem(), response=response)
+		item.default_output_processor = TakeFirst()
+
+		item.add_value('title', title)
+		item.add_value('link', response.url)
+		item.add_value('content', content)
+		item.add_value('date', date)
+
+		yield item.load_item()
diff --git a/main.py b/main.py
@@ -0,0 +1,2 @@
+from scrapy import cmdline
+cmdline.execute("scrapy crawl bancorp".split())
diff --git a/scrapy.cfg b/scrapy.cfg
@@ -0,0 +1,5 @@
+[settings]
+default = bancorp.settings
+
+[deploy]
+project = bancorp
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		from scrapy import cmdline
		cmdline.execute("scrapy crawl bancorp".split())