- Crawler in Ruby
- Getting the URLs of all 2018 articles from a Hatena Diary
require 'anemone'

urls = ['http://d.hatena.ne.jp/arupaka-_-arupaka/archive']

Anemone.crawl(urls, :delay => 3, :depth_limit => 2) do |anemone|
  # Follow only links whose path contains a month in 2018 (e.g. /201801...)
  anemone.focus_crawl do |page|
    page.links.keep_if { |link| link.to_s.match(/.*\/2018[0-9][0-9].*/) }
  end
  # Print the URL of every page that was reached
  anemone.on_every_page do |page|
    puts page.url
  end
end
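The script above only prints the URLs. A minimal sketch under the same settings could also collect the matched URLs and write them to a file (the output filename urls_2018.txt is an assumption, not something from the original script):

require 'anemone'

urls = ['http://d.hatena.ne.jp/arupaka-_-arupaka/archive']
collected = []

Anemone.crawl(urls, :delay => 3, :depth_limit => 2) do |anemone|
  anemone.focus_crawl do |page|
    page.links.keep_if { |link| link.to_s.match(/.*\/2018[0-9][0-9].*/) }
  end
  anemone.on_every_page do |page|
    collected << page.url.to_s
  end
end

# "urls_2018.txt" is an assumed output filename
File.open("urls_2018.txt", "w") { |f| collected.uniq.sort.each { |u| f.puts u } }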
# -*- coding: utf-8 -*-
require 'anemone'
require 'nokogiri'

id1  = "red_9512k"
urls = ["http://d.hatena.ne.jp/#{id1}/archive"]

f_out0 = open("data_list/#{id1}.dat", "w")
Dir.mkdir("data_source/#{id1}") unless FileTest.exists?("data_source/#{id1}")

Anemone.crawl(urls, :delay => 30, :depth_limit => 2) do |anemone|
  # From the archive pages, follow only links whose path contains a YYYYMM date
  anemone.focus_crawl do |page|
    puts page.links()
    page.links.keep_if { |link| link.to_s.match(/.*\/[0-9][0-9][0-9][0-9][0-9][0-9].*/) }
  end

  # Parse only daily pages, i.e. URLs ending in YYYYMMDD
  anemone.on_pages_like(/[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]$/) do |page|
    puts page.url
    page2 = Nokogiri::HTML.parse(page.body)
    v = page2.css("div.body")

    url3      = ""
    title1    = ""
    text1_con = ""
    time1     = page2.css('span.date').text()

    v.css('div.section').each { |j|
      # Skip the ad sections inserted by Google
      if not j['id'].to_s.match(/google/) then
        # Entry title and permalink are in the h3 heading
        j.css("h3").each { |k|
          title1 = k.text.to_s.encode('utf-8')
          url3   = k.css('a')[0]['href']
        }

        # Raw HTML of the section
        text_main1 = j.to_s.encode('utf-8')

        # Plain text of the section, category links removed, newlines escaped
        text1_con0 = j
        text1_con0.css('a.sectioncategory').remove
        text1_con = text1_con0.text().gsub(/\n/, '__SEPSEP__')

        # Save the raw HTML of each entry to its own file
        filename = url3.to_s.gsub(/^\//, '').gsub('/', '_PVSE_')
        filename = "data_source/#{id1}/" + filename + ".dat"
        f_out = open(filename, "w")
        f_out.write(text_main1)
        f_out.close()

        # One tab-separated line per entry: permalink, date, title, text
        write_data = url3 + "\t" + time1 + "\t" + title1 + "\t" + text1_con
        puts write_data
        f_out0.puts write_data
      end
    }
  end
end

f_out0.close()
- Turning it into a function
# -*- coding: utf-8 -*-
require 'anemone'
require 'nokogiri'

def crowle_h(id1)
  urls = ["http://d.hatena.ne.jp/#{id1}/archive"]

  f_out0 = open("data_list/#{id1}.dat", "w")
  Dir.mkdir("data_source/#{id1}") unless FileTest.exists?("data_source/#{id1}")

  Anemone.crawl(urls, :delay => 3, :depth_limit => 20) do |anemone|
    # From the archive pages, follow only links whose path contains a YYYYMM date
    anemone.focus_crawl do |page|
      puts page.links()
      page.links.keep_if { |link| link.to_s.match(/.*\/[0-9][0-9][0-9][0-9][0-9][0-9].*/) }
    end

    # Parse only daily pages, i.e. URLs ending in YYYYMMDD
    anemone.on_pages_like(/[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]$/) do |page|
      puts page.url
      page2 = Nokogiri::HTML.parse(page.body)
      v = page2.css("div.body")

      url3      = ""
      title1    = ""
      text1_con = ""
      time1     = page2.css('span.date').text()

      v.css('div.section').each { |j|
        # Skip the ad sections inserted by Google
        if not j['id'].to_s.match(/google/) then
          j.css("h3").each { |k|
            title1 = k.text.to_s.encode('utf-8')
            url3   = k.css('a')[0]['href']
          }

          text_main1 = j.to_s.encode('utf-8')

          text1_con0 = j
          text1_con0.css('a.sectioncategory').remove
          text1_con = text1_con0.text().gsub(/\n/, '__SEPSEP__')

          filename = url3.to_s.gsub(/^\//, '').gsub('/', '_PVSE_')
          filename = "data_source/#{id1}/" + filename + ".dat"
          f_out = open(filename, "w")
          f_out.write(text_main1)
          f_out.close()

          write_data = url3 + "\t" + time1 + "\t" + title1 + "\t" + text1_con
          puts write_data
          f_out0.puts write_data
        end
      }
    end
  end

  f_out0.close()
end

id1 = "tomokk81"
crowle_h(id1)
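Once it is a function, several Hatena IDs can be crawled in one run; a small sketch (the ID list is just the IDs that already appear in this memo):

["tomokk81", "red_9512k"].each { |id| crowle_h(id) }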
The version above makes far too many unnecessary requests, so here is a revised version (2018-02-25).
It uses anemone.focus_crawl so that the crawler does not follow links it does not need.
https://qiita.com/yoshiokaCB/items/99ac16aba790781c5092
https://qiita.com/tady/items/8a954dfcd03521f8a200
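The key point of the revision is the focus_crawl block: while walking the archive, only the archive's own pagination links (?word=&of=NN) are followed, and the per-entry links are only collected, not fetched. A minimal sketch of just this first stage (the regexes are the ones used in the full script below; the ID is one used earlier in this memo):

require 'anemone'

id1  = "arupaka-_-arupaka"                       # any Hatena ID
urls = ["http://d.hatena.ne.jp/#{id1}/archive"]
url_list = []

Anemone.crawl(urls, :delay => 3, :depth_limit => 30) do |anemone|
  # Follow only the archive's pagination links (?word=&of=NN)
  anemone.focus_crawl do |page|
    page.links.keep_if { |link| link.to_s.match(/word=\&of=[0-9][0-9]*/) }
  end
  # Record the per-entry links on each archive page without following them
  anemone.on_every_page do |page|
    links = page.links.select { |i| i.to_s.match(/.*hatena.ne.jp.*\/[0-9][0-9]*\/[0-9][0-9]*/) }
    url_list += links.map { |i| i.to_s }
  end
end

puts url_list.uniq.sort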
# -*- coding: utf-8 -*-
require 'anemone'
require 'nokogiri'

def crowle_h(id1)
  urls = ["http://d.hatena.ne.jp/#{id1}/archive"]

  f_out0 = open("data_list/#{id1}.dat", "w")
  Dir.mkdir("data_source/#{id1}") unless FileTest.exists?("data_source/#{id1}")

  # Stage 1: walk only the archive pages and collect the per-entry URLs
  url_list = []
  Anemone.crawl(urls, :delay => 3, :depth_limit => 30) do |anemone|
    # Follow only the archive's own pagination links (?word=&of=NN)
    anemone.focus_crawl do |page|
      page.links.keep_if { |link| link.to_s.match(/word=\&of=[0-9][0-9]*/) }
    end
    # Collect the per-entry links found on each archive page, without following them
    anemone.on_every_page do |page|
      puts page.url
      links = page.links.select { |i| i.to_s.match(/.*hatena.ne.jp.*\/[0-9][0-9]*\/[0-9][0-9]*/) }
      url_list = url_list + links.map { |i| i.to_s }
    end
  end

  url_list = url_list.uniq.sort
  puts url_list

  # Save the entry URL list
  f1 = open("data_list/url_list_" + id1 + ".csv", "w")
  url_list.each { |i| f1.puts(i.chomp) }
  f1.close()

  # Stage 2: fetch only the collected entry pages and extract each entry
  Anemone.crawl(url_list, :delay => 3, :depth_limit => 1) do |anemone|
    anemone.on_every_page do |page|
      puts page.url
      page2 = Nokogiri::HTML.parse(page.body)
      v = page2.css("div.body")

      url3      = ""
      title1    = ""
      text1_con = ""
      time1     = page2.css('span.date').text()

      v.css('div.section').each { |j|
        # Skip the ad sections inserted by Google
        if not j['id'].to_s.match(/google/) then
          j.css("h3").each { |k|
            title1 = k.text.to_s.encode('utf-8')
            url3   = k.css('a')[0]['href']
          }

          text_main1 = j.to_s.encode('utf-8')

          text1_con0 = j
          text1_con0.css('a.sectioncategory').remove
          text1_con = text1_con0.text().gsub(/\n/, '__SEPSEP__')

          filename = url3.to_s.gsub(/^\//, '').gsub('/', '_PVSE_')
          filename = "data_source/#{id1}/" + filename + ".dat"
          f_out = open(filename, "w")
          f_out.write(text_main1)
          f_out.close()

          write_data = url3 + "\t" + time1 + "\t" + title1 + "\t" + text1_con
          puts write_data
          f_out0.puts write_data
        end
      }
    end
  end

  f_out0.close()
end
There is still a bug, so a further improvement: handle pages whose URL is the date only.
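The problem is that the archive can link both to a date-only page (/<id>/YYYYMMDD) and to per-entry pages (/<id>/YYYYMMDD/NNNNNNNNNN) for the same day, so the same entry could be fetched twice. The fix drops a date-only URL whenever a per-entry URL for that date was also collected. A minimal sketch of just that filter (the URLs and timestamps here are made-up examples):

id1 = "arupaka-_-arupaka"   # example ID
url_list = [
  "http://d.hatena.ne.jp/#{id1}/20180225",             # date-only page
  "http://d.hatena.ne.jp/#{id1}/20180225/1519500000",  # per-entry page for the same date
  "http://d.hatena.ne.jp/#{id1}/20180101",             # date-only page with no per-entry URL
]

date_only = url_list.select { |i| i.match(/.*hatena.ne.jp\/#{id1}\/[0-9][0-9]*$/) }
per_entry = url_list.select { |i| i.match(/.*hatena.ne.jp\/#{id1}\/[0-9][0-9]*\/[0-9][0-9]*$/) }

date_only.each { |j|
  date = j.split("/").reverse[0]
  url_list.delete(j) if per_entry.any? { |k| k.include?(date) }
}

puts url_list   # the 20180225 date-only URL is gone, the 20180101 one remains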
# -*- coding: utf-8 -*-
require 'anemone'
require 'nokogiri'

def crowle_h(id1)
  urls = ["http://d.hatena.ne.jp/#{id1}/archive"]

  f_out0 = open("data_list/#{id1}.dat", "w")
  Dir.mkdir("data_source/#{id1}") unless FileTest.exists?("data_source/#{id1}")

  # Stage 1: walk only the archive pages and collect entry URLs
  # (both date-only pages and per-entry pages may be picked up here)
  url_list = []
  Anemone.crawl(urls, :delay => 3, :depth_limit => 1000) do |anemone|
    # Follow only the archive's own pagination links (?word=&of=NN)
    anemone.focus_crawl do |page|
      page.links.keep_if { |link| link.to_s.match(/word=\&of=[0-9][0-9]*/) }
    end
    # Collect every link of the form /<id>/<digits>... on each archive page
    anemone.on_every_page do |page|
      puts page.url
      links = page.links.select { |i| i.to_s.match(/.*hatena.ne.jp\/#{id1}\/[0-9][0-9]*.*[0-9]*$/) }
      url_list = url_list + links.map { |i| i.to_s }
    end
  end

  url_list = url_list.uniq.sort

  # Drop a date-only URL (/<id>/YYYYMMDD) when a per-entry URL
  # (/<id>/YYYYMMDD/NNNNNNNNNN) for the same date was also collected,
  # so that the same entry is not fetched twice
  url_list_s   = url_list.select { |i| i.to_s.match(/.*hatena.ne.jp\/#{id1}\/[0-9][0-9]*$/) }
  url_list_out = url_list.select { |i| i.to_s.match(/.*hatena.ne.jp\/#{id1}\/[0-9][0-9]*\/[0-9][0-9]*$/) }
  url_list_s.each { |j|
    if url_list_out.any? { |k| k.include?(j.split("/").reverse[0]) } then
      url_list.delete(j)
    end
  }

  puts url_list

  # Save the entry URL list
  f1 = open("data_list/url_list_" + id1 + ".csv", "w")
  url_list.each { |i| f1.puts(i.chomp) }
  f1.close()

  # Stage 2: fetch only the collected entry pages and extract each entry
  Anemone.crawl(url_list, :delay => 3, :depth_limit => 0) do |anemone|
    anemone.on_every_page do |page|
      puts page.url
      page2 = Nokogiri::HTML.parse(page.body)
      v = page2.css("div.body")

      url3      = ""
      title1    = ""
      text1_con = ""
      time1     = page2.css('span.date').text()

      v.css('div.section').each { |j|
        # Skip the ad sections inserted by Google
        if not j['id'].to_s.match(/google/) then
          j.css("h3").each { |k|
            title1 = k.text.to_s.encode('utf-8')
            url3   = k.css('a')[0]['href']
          }

          text_main1 = j.to_s.encode('utf-8')

          text1_con0 = j
          text1_con0.css('a.sectioncategory').remove
          text1_con = text1_con0.text().gsub(/\n/, '__SEPSEP__')

          filename = url3.to_s.gsub(/^\//, '').gsub('/', '_PVSE_')
          filename = "data_source/#{id1}/" + filename + ".dat"
          f_out = open(filename, "w")
          f_out.write(text_main1)
          f_out.close()

          write_data = url3 + "\t" + time1 + "\t" + title1 + "\t" + text1_con
          puts write_data
          f_out0.puts write_data
        end
      }
    end
  end

  f_out0.close()
end
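As before, the improved function is run by passing a Hatena ID (a sketch; the ID is one already used in this memo):

crowle_h("tomokk81")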