新增了纵横中文网《不败战神》抓取

2013年11月04日 23:35

不败战神 抓取

# encoding: utf-8
require 'rubygems'
require 'nokogiri'
require 'open-uri'

# Scraper for novels hosted on book.zongheng.com.
#
# For every configured novel it downloads the chapter index page, takes the
# last RECENT_CHAPTERS chapter links found on that page (presumably the newest
# ones - confirm against the site layout), fetches each chapter body and
# stores it through the Novel and Chapter models.
class Zongheng

  # Novel name => URL of its chapter index page.
  NOVEL_URLS = {
    "不败战神" => "http://book.zongheng.com/showchapter/251393.html"
  }.freeze

  # How many links, counted from the end of the index page, are fetched per run.
  RECENT_CHAPTERS = 2

  # A chapter body is stored only when it is longer than this many characters.
  MIN_CONTENT_LENGTH = 100

  # Read timeout, in seconds, handed to open-uri.
  READ_TIMEOUT = 20

  # Entry point: scrapes every novel in NOVEL_URLS and logs start/end times.
  #
  # Errors are rescued per novel, so a failure on one novel is logged and the
  # remaining novels are still processed.
  def self.start
    start_time = Time.now.to_s
    NOVEL_URLS.each_pair do |name, val|
      begin
        get_chapter_list(name, val)
      rescue StandardError => e
        # StandardError only: Interrupt / SystemExit must still stop the process.
        Rails.logger.error "fetch zongheng had a Exception#{e}"
      end
    end
    Rails.logger.info "Zongheng end>>>>start_time=#{start_time}>>>>end_time=#{Time.now}"
  end

  # Fetches the chapter index at +base_url+ and stores the chapters behind the
  # last RECENT_CHAPTERS links on it.
  #
  # @param name [String] novel name used to find or create the Novel record
  # @param base_url [String] URL of the chapter index page
  # @return [String, Array] "" when the index page could not be fetched,
  #   otherwise the list of links that were processed
  def self.get_chapter_list(name, base_url)
    doc = fetch_document(base_url)
    return "" unless doc

    # to_a.last(n) yields a shorter (possibly empty) array when the page has
    # fewer than n links, instead of a nil that would blow up on #each.
    links = doc.css("div.booklist td a").to_a.last(RECENT_CHAPTERS)
    novel = nil
    links.each do |title|
      post_content = get_post_content(title['href'])
      next unless post_content && post_content.length > MIN_CONTENT_LENGTH

      # Content present: persist chapter title, chapter body and novel name.
      # The novel is looked up lazily so it is only created once real content exists.
      novel ||= Novel.where(:name => name).first_or_create
      # NOTE(review): chapters are matched by title only, not by novel - two
      # novels sharing a chapter title would share one record. Confirm intent.
      chapter = Chapter.where(:name => title.text).first_or_create(:content => post_content, :novel => novel)

      # Compare the text itself: safe when the stored content is nil and it
      # also catches edits that leave the length unchanged.
      next if chapter.content == post_content
      chapter.content = post_content
      chapter.save
    end
  end

  # Downloads one chapter page and returns the inner HTML of #chapterContent
  # with watermark elements and links removed.
  #
  # @param url [String] chapter page URL
  # @return [String] UTF-8 HTML of the chapter body, or "" when the page
  #   could not be fetched
  def self.get_post_content(url)
    doc = fetch_document(url)
    return "" unless doc

    post_content = doc.css("#chapterContent")
    # Strip useless links and watermark markup.
    post_content.css(".watermark").remove
    post_content.css("a").remove
    post_content.inner_html.encode("utf-8")
  end

  # Opens +url+ for reading over HTTP(S).
  #
  # Only http/https URLs are opened. The URLs handed in here include hrefs
  # scraped from a remote page, and passing such strings to Kernel#open would
  # let a non-URL value open a local file or a "|command" pipe.
  #
  # @param url [String] URL to open
  # @return [IO, nil] readable handle, or nil when the URL is not http(s) or
  #   the request fails
  def self.i_open(url)
    uri = URI.parse(url.to_s)
    # URI::HTTPS is a subclass of URI::HTTP, so this accepts both schemes.
    return nil unless uri.is_a?(URI::HTTP)

    uri.open(:read_timeout => READ_TIMEOUT)
  rescue StandardError
    nil
  end

  # Fetches +url+ and parses it as HTML, closing the download handle afterwards.
  #
  # @param url [String] page URL
  # @return [Nokogiri::HTML::Document, nil] parsed page, or nil when the fetch failed
  def self.fetch_document(url)
    io = i_open(url)
    return nil unless io

    begin
      Nokogiri::HTML(io)
    ensure
      io.close
    end
  end
  private_class_method :fetch_document

end