Created
December 1, 2010 18:43
-
-
Save nileshtrivedi/723980 to your computer and use it in GitHub Desktop.
Scraping fun with Google, Flipkart and Ruby
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'net/http'
require 'uri'
require 'csv'

# Step 1: collect public Flipkart wishlist URLs from Google web search.
#
# Fetches a fixed number of Google result pages for the query
# "public wishlist" site:flipkart.com, extracts every string that looks like
# a Flipkart wishlist URL, and writes each distinct one (without scheme) on
# its own line to flipkart-public-wishlists.txt.

# Search URL template; the literal NNNN is replaced by the `start` query
# parameter for each requested page.
GOOGLE_SEARCH_URL = "http://www.google.com/search?q=%22public+wishlist%22+site:flipkart.com&hl=en&site=webhp&prmd=iv&ei=ZYL2TOr3KYLyrQfv4dzWBg&start=NNNN&sa=N&fp=1b624545158a7512&tch=1&ech=1&psi=MX_2TOSlL82BrQegxqDYBg129122283549813".freeze

# Number of result pages to request, and how far `start` advances per page.
RESULT_PAGES = 55
RESULTS_PER_PAGE = 10

# A wishlist URL runs until the first space, '+' or backslash.
# NOTE(review): the backslash stop presumably exists because the response
# body is escaped text rather than plain HTML - confirm against a live response.
WISHLIST_URL_PATTERN = /www\.flipkart\.com\/wishlist[^ +\\]*/

File.open("flipkart-public-wishlists.txt", "w") do |f|
  # URLs already written; the same wishlist can show up on several result pages.
  seen = {}

  RESULT_PAGES.times do |page|
    puts "scraping page #{page} from google web search"
    start = page * RESULTS_PER_PAGE
    uri = URI.parse(GOOGLE_SEARCH_URL.sub('NNNN', start.to_s))
    sleep(2) # dear google, please be nice :)
    response = Net::HTTP.get_response(uri)

    # A redirect or error page contains no results; report it instead of
    # silently recording nothing for this page.
    unless response.is_a?(Net::HTTPSuccess)
      warn "skipping page #{page}: HTTP #{response.code}"
      next
    end

    response.body.to_s.scan(WISHLIST_URL_PATTERN).each do |match|
      next if seen[match]

      seen[match] = true
      f.puts match
    end
  end
  puts "Done"
end
# Step 2: download every collected wishlist page and dump its entries as raw
# CSV rows into flipkart-wishlist-raw-data.csv, one row per book:
#   wishlist URL, book URL, book title, price capture 1, price capture 2, price capture 3
# The raw data is meant to be analysed afterwards (e.g. with spreadsheet pivot tables).

# Captures [book URL, book title] for each entry on a wishlist page.
WISHLIST_ITEM_PATTERN = /<div class="search_result_title">\s+<a href="([^"]+)" title="([^"]+)"/

# Captures three optional groups per price block (each may be nil):
#   1. the list-price <span> markup, or the text "Not Available"
#   2. the highlighted <font> markup around the "Rs." amount
#   3. the digits of that "Rs." amount
WISHLIST_PRICE_PATTERN = /<span class="search_results_price">Price: (<span class="search_results_list_price">[^<]*<\/span>|Not Available)?[^<]*(<font color='#993300'><b>Rs. (\d+)<\/b>)?/

# The input is a plain list with one URL per line, not CSV: read it line by
# line so commas or quotes inside a URL cannot truncate or break parsing, and
# drop blank lines and duplicates so no page is fetched twice.
wishlists = File.readlines("flipkart-public-wishlists.txt")
                .map(&:strip)
                .reject(&:empty?)
                .uniq
                .map { |url| "http://#{url}" }

# CSV.open lets the library quote and escape fields; the captured price markup
# itself contains double quotes, which hand-built rows would emit unescaped.
CSV.open("flipkart-wishlist-raw-data.csv", "w") do |csv|
  wishlists.each_with_index do |wishlist, count|
    sleep(2) # you too, flipkart :)
    puts "trying wishlist #{count}: #{wishlist}"

    # Scraped text is not guaranteed to be a valid URL; skip it rather than
    # aborting the whole run.
    begin
      uri = URI.parse(wishlist)
    rescue URI::InvalidURIError => e
      warn "skipping #{wishlist}: #{e.message}"
      next
    end

    response = Net::HTTP.get_response(uri)
    unless response.is_a?(Net::HTTPSuccess)
      warn "skipping #{wishlist}: HTTP #{response.code}"
      next
    end

    body = response.body.to_s
    items = body.scan(WISHLIST_ITEM_PATTERN)
    prices = body.scan(WISHLIST_PRICE_PATTERN)
    puts "Sizes do not match" if items.size != prices.size

    # zip pads missing price entries with nil, so an item without a matching
    # price block still produces a row (with empty price columns) instead of
    # raising on nil.
    items.zip(prices).each do |(book_url, title), price|
      list_price, offer_markup, offer_amount = price
      csv << [wishlist, book_url, title, list_price, offer_markup, offer_amount]
    end
  end
  puts "Done"
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment