Ameblo/Livedoor/excite/yaplog のブログクローラ書いた
ブログ記事を収集する必要が生じてクローラ書いた．
最初は
wget -a ./wget.log -w 30 -r -np -m -k -erobots=off -np blog_url
とかやってたけど月別一覧やカテゴリ一覧，モバイル版URLも集めようとしてかなり重複してしまい一向に終わらないので，主要なブログサービスに合わせて書いた．
どこから辿るのが記事を網羅できるか考えたところ
- Livedoor: base_url/archives/year-month.html?p=pos
- 続きがあるかどうかの判定，ブログによってまちまちなのでキーワードマッチにした
- Ameblo: base_url/archive#{pos}-#{year}#{month}.html
- excite: base_url/page/#{pos}
- yaplog: base_url/#{pos}
- base_url/monthly/#{year}#{month}/ で辿る方法もあるけどブログによって記事が全表示されるものと記事タイトルだけが出るものがあったので面倒に見えた
でたどるのが一番良さそうだった．
year/monthに当たる部分は2003年から決め打ちで書いた．ここ二分探索とかして記事が書かれ始めた年月を特定するとかすればもっと賢くなると思う．
とりあえず全記事これで落ちてくるはずなのでダウンロード後に個別記事ごとに切り出すとかすればいいと思う．
# -*- coding: utf-8 -*-
require 'nokogiri'
require 'open-uri'

# Seconds to pause before every HTTP request.
WAIT_SEC = 30
# Value sent as the User-Agent header with every request.
UA = ''
# Default years walked by the crawlers whose archive URLs are per month.
ARCHIVE_YEARS = (2003..2012).freeze

# Waits WAIT_SEC, downloads +url+ and writes the page to +dest+.
#
# With parse: true the body is parsed by Nokogiri and the re-serialised
# HTML (doc.inner_html) is what gets written; with parse: false the raw
# response body is written unchanged.
#
# Any StandardError (HTTP error, unparsable URL, unwritable file, ...) is
# reported on stderr and turned into a nil return so that callers can stop
# paginating instead of requesting ever-higher page numbers.
#
# @param url [String] absolute URL of the page to fetch
# @param dest [String] path of the file to write
# @param parse [Boolean] whether to run the body through Nokogiri
# @return [Nokogiri::HTML::Document, String, nil] the parsed document, the
#   raw body, or nil when fetching or saving failed
def fetch_and_save(url, dest, parse: true)
  sleep WAIT_SEC
  # URI#open (from open-uri) instead of Kernel#open: the latter no longer
  # accepts URLs on Ruby 3.0+ and would treat a leading "|" as a command.
  body = URI.parse(url).open('User-Agent' => UA).read
  page = parse ? Nokogiri::HTML(body) : body
  File.open(dest, 'w') { |f| f.puts(parse ? page.inner_html : page) }
  page
rescue StandardError => e
  warn "#{url}: #{e.class}: #{e.message}"
  nil
end

# Yields every (year, month) pair in +years+; the month is a zero-padded
# two-character string ("01".."12") ready to be interpolated into a URL.
def each_archive_month(years)
  years.each do |year|
    1.upto(12) { |month| yield year, format('%02d', month) }
  end
end

# Crawls an Ameblo blog through its monthly archive pages
# (base_url/archive<pos>-<year><month>.html) and saves each page under the
# same file name in +save_path+.
#
# A month is finished when a page has no "a.nextPage" link or could not be
# fetched/saved.
#
# @param base_url [String] blog root URL; a trailing "/" is ignored and the
#   argument itself is not modified
# @param save_path [String] existing directory that receives the pages
# @param years [Enumerable<Integer>] years whose twelve months are visited
# @return [void]
def get_ameblo(base_url, save_path, years: ARCHIVE_YEARS)
  root = base_url.chomp('/')
  each_archive_month(years) do |year, month|
    1.step do |pos|
      name = "archive#{pos}-#{year}#{month}.html"
      doc = fetch_and_save("#{root}/#{name}", File.join(save_path, name))
      # Stop on failure or when there is no link to a following page.
      break if doc.nil? || doc.css('a.nextPage').empty?
    end
  end
end

# Crawls an excite blog through base_url/page/<pos> and saves each page as
# <pos>.html in +save_path+.
#
# Crawling ends when a page has no "a.older_page" link or could not be
# fetched/saved.
#
# @param base_url [String] blog root URL; a trailing "/" is ignored and the
#   argument itself is not modified
# @param save_path [String] existing directory that receives the pages
# @return [void]
def get_excite(base_url, save_path)
  root = base_url.chomp('/')
  1.step do |pos|
    doc = fetch_and_save("#{root}/page/#{pos}", File.join(save_path, "#{pos}.html"))
    # Stop on failure or when there is no link to an older page.
    break if doc.nil? || doc.css('a.older_page').empty?
  end
end

# Crawls a Livedoor blog through its monthly archive pages
# (base_url/archives/<year>-<month>.html?p=<pos>) and saves each page as
# <year>-<month>.html_<pos> in +save_path+.
#
# @param base_url [String] blog root URL; a trailing "/" is ignored and the
#   argument itself is not modified
# @param save_path [String] existing directory that receives the pages
# @param years [Enumerable<Integer>] years whose twelve months are visited
# @return [void]
def get_livedoor(base_url, save_path, years: ARCHIVE_YEARS)
  root = base_url.chomp('/')
  each_archive_month(years) do |year, month|
    archive_url = "#{root}/archives/#{year}-#{month}.html"
    1.step do |pos|
      dest = File.join(save_path, "#{year}-#{month}.html_#{pos}")
      doc = fetch_and_save("#{archive_url}?p=#{pos}", dest)
      # Whether a "next" element exists depends on the blog's theme, so
      # instead look for the next page's URL (followed by the closing quote
      # of an attribute) anywhere in the markup.
      next_url = "#{archive_url}?p=#{pos + 1}\""
      break if doc.nil? || !doc.inner_html.include?(next_url)
    end
  end
end

# Crawls a yaplog blog through base_url/<pos> and saves each page as
# <pos>.html in +save_path+.
#
# yaplog occasionally returns incomplete HTML that makes Nokogiri parsing
# fail, so the raw response body is saved and searched as plain text.
#
# @param base_url [String] blog root URL; a trailing "/" is ignored and the
#   argument itself is not modified
# @param save_path [String] existing directory that receives the pages
# @return [void]
def get_yaplog(base_url, save_path)
  root = base_url.chomp('/')
  1.step do |pos|
    html = fetch_and_save("#{root}/#{pos}", File.join(save_path, "#{pos}.html"), parse: false)
    # The next page exists if its URL (plus closing quote) appears in the body.
    next_link = "#{root}/#{pos + 1}\""
    break if html.nil? || !html.include?(next_link)
  end
end