使用Ruby和Nokogiri模拟爬虫导出RSS种子的实例详解

互联网 17-5-2
# encoding: utf-8
#
# Scrapes the project listing on rdoc.info, fetches the README of every listed
# project in its own thread, and writes the collected README excerpts to
# "RubyFeeds.rss" as an RSS 2.0 feed.

require 'thread'
require 'nokogiri'
require 'open-uri'
require 'rss/maker'

# Thread-safe queue filled by the worker threads with
# [no, name, readme_url, date, text] tuples.
$result = Queue.new

# Fetches one project's page, locates its README frame and, when the README
# excerpt passes the filter, pushes a result tuple onto $result.
#
# @param no [Integer] position of the project in the listing
# @param name [String] project name (link text from the listing)
# @param url [String] absolute URL of the project's page
# @return [void] the result is delivered through $result; any StandardError is
#   printed and swallowed so a single failing project does not abort the run
def extract_readme_header(no, name, url)
  # URI.open is used because open-uri's Kernel#open support for URLs was
  # removed in Ruby 3.0.
  frame = Nokogiri::HTML(URI.open(url))

  # The README lives in the second <frame>; skip pages that do not have one.
  readme_frame = frame.css('frame')[1]
  return unless readme_frame && readme_frame['src']

  readme = $url + readme_frame['src']
  URI.open(readme) do |f|
    doc = Nokogiri::HTML(f.read)
    # Excerpt = text of the first five paragraphs of the README body.
    paragraphs = doc.css('div#content div#filecontents p').first(5)
    text = paragraphs.map(&:content).join(' ').strip
    return if text.empty?

    # Skip READMEs that mention Rails.
    # NOTE(review): `activ_` does not match "active_record"/"active_support";
    # presumably `activ` or `active_` was intended - confirm before changing.
    if text !~ /(rails)|(activ_)/i
      puts "========= #{no} #{name} : #{text[0..50]}"
      # last_modified may be nil when the server sends no Last-Modified header.
      date = f.last_modified
      $result << [no, name, readme, date, text]
    end
  end
rescue StandardError => e
  puts e.message
end

# Builds an RSS 2.0 document from the collected results.
#
# @param items [Array<Array>] tuples of [no, name, url, date, descr]
# @return [RSS::Rss, nil] whatever RSS::Maker.make returns for the feed
def make_rss(items)
  RSS::Maker.make('2.0') do |maker|
    maker.channel.title = 'GitHub recently updated projects'
    maker.channel.link = 'http://localhost'
    maker.channel.description = 'GitHub recently updated projects'
    # Let the maker order the items itself when the feed is generated.
    maker.items.do_sort = true
    items.each do |_no, name, url, date, descr|
      item = maker.items.new_item
      item.title = name
      item.link = url
      item.description = descr
      item.date = date
    end
  end
end

# ------------------------------- main -------------------------------

# Scan the project listing and start one worker thread per project.
lth = []
$url = 'http://rdoc.info'
puts "get url #{$url}..."
doc = Nokogiri::HTML(URI.open($url))

# The second <ul class="libraries"> is the list that gets scraped.
libraries = doc.css('ul.libraries')[1]
abort "no project list found at #{$url}" unless libraries

libraries.css('li').each_with_index do |li, i|
  aname = li.css('a').first
  next unless aname && aname['href']

  name = aname.content
  purl = $url + aname['href']
  # Pass values as thread arguments so each thread gets its own copies.
  lth << Thread.new(i, name, purl) { |j, n, u| extract_readme_header(j, n, u) }
end

# Wait until every README has been read.
lth.each(&:join)

# Drain the queue and order the results by their position in the listing
# (ascending index), since threads finish in arbitrary order.
result = []
result << $result.shift until $result.empty?
result.sort_by!(&:first)

# Write the results as an RSS feed.
File.open('RubyFeeds.rss', 'w') do |file|
  file.write make_rss(result)
end

以上就是使用Ruby和Nokogiri模拟爬虫导出RSS种子的实例详解的详细内容,更多内容请关注技术你好其它相关文章!

来源链接:
免责声明:
1.资讯内容不构成投资建议,投资者应独立决策并自行承担风险
2.本文版权归属原作所有,仅代表作者本人观点,不代表本站的观点或立场
上一篇:php获取远程图片并下载保存到本地的方法分析 下一篇:XmlSlurper解析RSS的实例代码

相关资讯