学习交流,发布抓网页小说源代码(可包含图片),内详

学习交流,发布抓网页小说源代码(可包含图片),内详

不好意思,在这里再发一遍...

想在工作生活中多用RUBY。平时喜欢看小说,发觉古今书屋的书更新比较快,就写了个RUBY小程序来抓取古今书屋小说的相关网页(网页中可包含图片)。这里发布的只是自己随便用的小程序,肯定有很多不足,有不满意的地方请自行修改。发布出来只是为了和大家交流RUBY代码的写法,因为我发觉网上发布的实用一点的RUBY源代码太少了。

调用的批处理文件内容比如 sbzl.bat 内容如下
ruby getbookimg.rb 随波逐流之神龙传奇 http://www.gjsww.com/Html/Book/17/621/List.html
就可以在执行getbookimg.rb的当前目录下建立文件夹并下载相关网页了

文件名为getbookimg.rb

require 'uri'
require 'iconv'
require 'hpricot'
require 'net/http'
require "open-uri"
require 'fileutils'
##require 'extensions/string'

# Downloads the resource at +loc+ and hands the response body back untouched.
#
# loc      - absolute URL string; it is parsed with URI.parse.
# encoding - accepted so existing call sites keep working, but currently
#            unused: the Iconv transcoding step that consumed it is disabled,
#            so no character-set conversion takes place.
#
# Returns whatever Net::HTTP.get returns for the parsed URI.
def get(loc, encoding=nil)
  Net::HTTP.get(URI.parse(loc))
end

# Command-line arguments:
#   ARGV[0] - name of the directory, relative to the current directory, that
#             receives the downloaded book
#   ARGV[1] - URL of the page that lists the book's chapters
localdirname = ARGV[0]
base = ARGV[1]
# Without both arguments the path concatenation below would fail with an
# opaque TypeError on nil, so stop early with a usage line instead.
if localdirname.nil? || base.nil?
  abort "usage: ruby #{$0} <book-directory-name> <chapter-list-url>"
end

dir_expand_path = File.expand_path(".")

dir_book_path = "#{dir_expand_path}/#{localdirname}"
dir_book_path_2 = "#{dir_book_path}/"

# fnmax is the largest numeric file name among the *.htm* files already
# present in the book directory (0 when there are none). The chapter loop
# compares chapter numbers against it to skip pages saved by an earlier run.
fnmax = 0
Dir[dir_book_path_2 + "*.htm*"].each do |x|
  # File name without directory and last extension; non-numeric names give 0.
  saved_no = File.basename(x, ".*").to_i
  fnmax = saved_no if saved_no > fnmax
end


# Working variables declared up front at top level; the URL-parsing code and
# the chapter loop further down assign to them.
webbookpath = ""
webbookcontentimagepath = ""
host = ""
book_content_path = ""
book_image_path = ""
html = ""
doc = ""

# Debug switch: when true the chapter list is parsed from a local file named
# "list.html" instead of being fetched from +base+.
uselocalfiletodebug = false

# Parsed chapter-list document, from the local file or from the network.
doc =
  if uselocalfiletodebug
    base = "list.html"
    Hpricot.parse(File.read(base))
  else
    html = get(base.to_s, 'gbk')
    Hpricot.parse(html)
  end

# Split the chapter-list URL into the pieces the chapter loop needs:
#   host        - "http://" plus the authority part; image paths are appended to it
#   webbookpath - URL of the directory holding the chapter list; chapter
#                 links are appended to it
m = %r<http://([^/]+)>.match(base)
# The message must interpolate +base+: the name used here before was never
# defined, so a bad URL surfaced as NameError instead of ArgumentError.
raise ArgumentError, "cannot parse URI: #{base}" unless m
host = "http://" + m[1].strip
path = m.post_match
path = '/' if path.empty?

puts "主机名 #{host}"
puts "书籍所在网页路径 #{path}"

# Cut off the file name, keeping the directory without a trailing slash.
# The exclusive range matters when the only slash is at index 0: an
# inclusive 0..rl-1 would wrap to -1 and return the string unchanged.
rl = path.rindex('/')
path = path[0...rl]
puts "分析出下载书籍网页的本地路径 path #{path}"

# Built from host + directory rather than by slicing +base+ at its last
# slash, which for a host-only URL is one of the slashes in "http://".
webbookpath = host + path
puts "发现下载书籍所在网页路径 webbookpath #{webbookpath}"


# Appends +ref+ to +prefix+ with exactly one "/" between them: a +ref+ that
# already starts with "/" is appended as-is, anything else gets a "/" added.
def join_ref(prefix, ref)
  ref[0, 1] == '/' ? prefix + ref : prefix + "/" + ref
end

# Saves the image at +url+ to +local_path+, creating the target directory.
# Nothing is written when the local file already has the size the server
# reports, or when the response came from a different URL than requested.
def download_image(url, local_path)
  url = url.strip
  FileUtils.makedirs(File.dirname(local_path))

  data = nil
  # URI#open is provided by open-uri. Unlike Kernel#open it never treats a
  # "|"-prefixed string as a command, and the body is read from this single
  # response instead of requesting the image a second time.
  URI.parse(url).open do |f|
    # Per the original author's note: the file is considered available only
    # when the URL the response came from equals the requested one.
    if f.base_uri.to_s.eql?(url)
      if File.exist?(local_path)
        # A size mismatch means the image changed on the server.
        if File.size?(local_path).to_i != f.meta['content-length'].to_i
          puts "文件网上被更新,2者不相等,需要重新下载"
          data = f.read
        else
          puts "图像文件下载后网上没有被更新,不用重新下载"
        end
      else
        data = f.read
      end
    end
  end

  File.open(local_path, "wb") { |out| out.write(data) } if data
end

# Walk every anchor of the chapter list; each href containing ".html" is a
# chapter page that is fetched, has its .gif images mirrored locally, and is
# then written into the book directory.
(doc/"a").each do |link|
  href = link.attributes['href']
  next if href.nil? || href.index('.html').nil?

  # Chapter number = href without its extension. Chapters numbered below the
  # highest already-saved file are skipped; hrefs that are not plain numbers
  # convert to 0 and are always processed.
  chapter_no = href[0..href.rindex('.')-1].to_i
  next if chapter_no != 0 && chapter_no < fnmax

  chapter_url = join_ref(webbookpath, href)
  book_content_path = join_ref(dir_book_path, href)

  puts "发现书籍章节相对链结数据 #{link}"
  FileUtils.makedirs(File.dirname(book_content_path))

  # In debug mode parse a local sample chapter. This previously re-read the
  # chapter-list file (+base+) instead of the sample named here.
  base2 = uselocalfiletodebug ? "993170.html" : chapter_url
  doc2 =
    if uselocalfiletodebug
      Hpricot.parse(File.read(base2))
    else
      Hpricot.parse(get(base2, 'gbk'))
    end

  # Mirror every .gif image referenced by the chapter.
  (doc2/"img").each do |imagelink|
    src = imagelink.attributes['src']
    next if src.nil? || src.index('.gif').nil?

    image_url = join_ref(host, src)
    book_image_path = join_ref(dir_book_path, src)

    puts "分析后图像最终下载链接 webbookcontentimagepath #{image_url}"
    download_image(image_url, book_image_path)
  end

  # Turn root-relative image paths ("/x/y.gif") into "./x/y.gif" in the saved
  # page. Images without a src are skipped (they used to raise TypeError),
  # and relative paths are left alone: prefixing "." to "x/y.gif" would give
  # ".x/y.gif", a location no image is ever saved to.
  (doc2/"img").each do |imagelink|
    src = imagelink.attributes['src']
    next if src.nil? || src[0, 1] != '/'
    imagelink.attributes['src'] = "." + src
  end

  # Block form closes the file even on error and, unlike an ensure that calls
  # close on a possibly-nil handle, does not mask a failure to open it.
  File.open(book_content_path, 'wb') { |f| f.puts doc2 }
end
可以把 'iconv' 和 'fileutils' 这两个包传上来吗?

谢谢
引用:
原帖由 xnine 于 2007-11-22 08:20 发表
可以把 'iconv' 和 'fileutils' 这两个包传上来吗?

谢谢
恩 在线安装不能,
也没有找到gem。...
引用:
原帖由 xnine 于 2007-11-22 12:49 发表
恩 在线安装不能,
也没有找到gem。...
lz也用Hpricot来解析html呀

不知道效率较之正则匹配怎么样
现在还不是很懂 不过感谢楼主贴出来 学习了