பயனர்:Info-farmer/ruby-wiki screen scrape

require "date"
require "mechanize"
mechanize = Mechanize.new do |a|
  a.follow_meta_refresh = true
  a.keep_alive = false
end
page = mechanize.get('https://commons.wikimedia.org/wiki/Category:PDF_files_in_Tamil_with_English_Titles')
links = []
page.search('#mw-category-media li .gallerytext a').each do |link|
  links << link['href']
end
host_name = page.uri.hostname
scheme = page.uri.scheme

File.open("url_#{DateTime.now.to_s}.txt",'w') do |f|
  links.each do |link|
    link_with_base = "#{scheme}://#{host_name}#{link}"
    # puts "Getting for...#{link_with_base}"
    page = mechanize.get(link_with_base)
    file_link = page.search('#mw-content-text .fullMedia a')[0]
    file_data = page.search('#mw-content-text .fullMedia span.fileInfo')[0]
    puts file_data.inspect
    file_data_text_formated = file_data.text.gsub(/[()]/, "").split(/,\s/).join("~")
    output = "#{file_link['title']}~#{file_link['href']}~#{file_data_text_formated}"
    puts output
    f.puts output
  end
end

puts "Completed...."