#!/usr/bin/python
#
# goopy: python module for google searches. Version 0.5
#	Written by Michael G. (mynameisfiber@gmail.com) (copyleft)
#
# This version can handle:
# 	Websites (local or foreign language)
# 	Video Results
# 	Files (e.g. pdf, ps, ppt, etc.)
# 	Recommended Search Terms
# 	
#	=====================================================================
#
#	This program is free software: you can redistribute it and/or modify
#	it under the terms of the GNU General Public License as published by
#	the Free Software Foundation, either version 3 of the License, or
#	(at your option) any later version.
#	
#	This program is distributed in the hope that it will be useful,
#	but WITHOUT ANY WARRANTY; without even the implied warranty of
#	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#	GNU General Public License for more details.
#	
#	You should have received a copy of the GNU General Public License
#	along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
#	=====================================================================

from urllib import quote_plus
import urllib2
import re
from textwrap import wrap

class websearch(object):
    """Scrape Google web-search result pages for a query.

    Constructing an instance immediately fetches the first page of results.
    Further pages are fetched on demand by get_results(), by iteration and
    by indexing.  Every result is a dict with the keys 'url', 'title' and
    'desc'.

    Attributes:
        opener: the urllib2 opener used for every request.
        results: list of the result dicts accumulated so far.
        query: the query string of the most recent search.
        offset: the `start` offset of the most recent search.
        recommend: the "Did you mean" suggestion, or "" if none was seen.
        numResults: the "of about N" count exactly as matched in the page
            (a string such as "1,230"), or the integer 0 when the page
            contained no count.
        iterate: position of the iterator set up by __iter__().
    """

    # The patterns below are tied to the exact markup of the result page and
    # are compiled once for the whole class.  extract_results() relies on the
    # group numbering of find_results, so keep both in sync when editing.
    find_results = re.compile(
        r'<div class=g( style="margin-left:2.5em;")?><a href="(?P<url>.+?)"'
        r' class=l>(?P<title>.+?)</a>(<nobr>(.+?)</nobr>)?<table border=0 cellpadding=0 cellspacing=0>'
        r'<tr><td class="j( hc)?"><div class=std>(.+?)<br>')
    find_videos = re.compile(
        r'<td valign=top><a href="(?P<url>.+?)" class=l>(?P<title>.+?)</a><br><font size=-1>(?P<desc>.+?)<br>')
    find_files = re.compile(
        r'\[(?P<type1>.{2,5})\]</b></font></span> <a href="(?P<url>.+?)" class=l>'
        r'(?P<title>.+?)</a><table border=0 cellpadding=0 cellspacing=0><tr><td class="j"><div class=std>'
        r'<span class=f>File Format:</span> (?P<type2>.+?)( - |<br>)(.+?)<br>(?P<desc>.+?)<br>')
    find_recommend = re.compile(
        r'<font color="#cc0000" class=p>Did you mean( to search for)?: </font><a'
        r' href="(.+?)" class=p><b><i>(?P<recommend>.+?)</i></b></a>')
    find_num_results = re.compile(
        r'<font size=-1>Results <b>[0-9,]+</b> - <b>[0-9,]+</b> of about <b>(?P<results>[0-9,]+)</b>')
    clean_regex = re.compile(r'(<(/)?(.+?)>|&(.+?);)')

    def __init__(self, query):
        """Remember `query` and fetch its first page of results."""
        self.opener = urllib2.build_opener()
        self.results = []
        self.query = query
        self.offset = 0
        self.recommend = ""
        self.numResults = 0
        self.iterate = 0
        self.search(self.query)

    def clean(self, string):
        """Return `string` with HTML tags and character entities removed."""
        return self.clean_regex.sub("", string)

    def format_width(self, string):
        """Wrap `string` at 60 columns, indenting continuation lines by a tab."""
        return "\n\t".join(wrap(string, 60))

    def search(self, query, offset=0):
        """Perform the search and retrieve the results at `offset`.

        The parsed results are appended to self.results; query, offset,
        numResults and (when a suggestion is present) recommend are updated.

        Returns:
            self.results, i.e. everything accumulated so far.
        """
        start = "&start=%d" % offset if offset > 0 else ""
        request = urllib2.Request('http://www.google.com/search?q=' + quote_plus(query) + start)
        self.opener.addheaders = [('User-Agent', 'google-cli')]
        response = self.opener.open(request)
        try:
            data = response.read()
        finally:
            # Release the connection even when the read fails.
            response.close()

        counts = self.find_num_results.findall(data)
        self.numResults = counts[0] if counts else 0

        # A missing suggestion leaves any earlier one in place.
        suggestion = self.find_recommend.search(data)
        if suggestion is not None:
            self.recommend = suggestion.group('recommend')

        self.query = query
        self.offset = offset
        self.results.extend(self.extract_results(data))
        return self.results

    def search_next(self):
        """Fetch the page that follows the most recently fetched one."""
        return self.search(self.query, self.offset + 10)

    def extract_results(self, data):
        """Parse the retrieved HTML into a list of result dicts.

        File results come first (their title is prefixed with the bracketed
        file type), then video results, then ordinary web results.
        """
        clean = self.clean
        files = [{'url': m.group('url'),
                  'title': "[%s] %s" % (clean(m.group('type1')), clean(m.group('title'))),
                  'desc': clean(m.group('desc'))}
                 for m in self.find_files.finditer(data)]
        videos = [{'url': m.group('url'),
                   'title': clean(m.group('title')),
                   'desc': clean(m.group('desc'))}
                  for m in self.find_videos.finditer(data)]
        # Group 7 of find_results is the unnamed description group.
        web = [{'url': m.group('url'),
                'title': clean(m.group('title')),
                'desc': clean(m.group(7))}
               for m in self.find_results.finditer(data)]
        return files + videos + web

    def show_results(self, index, number):
        """Print the `number` results starting at `index` in a pretty format."""
        for position, result in enumerate(self.get_results(index, number)):
            # Single-argument print(...) produces the same output whether
            # print is a statement or a function.
            print("%d) %s\n\t%s\n\t%s" % (index + position, result["title"], result["url"],
                                         self.format_width(result["desc"])))

    def get_results(self, index, number):
        """Return the results from `index` up to `index + number` as a list.

        More pages are fetched while too few results are loaded and the last
        page reported a result count.  Fetching also stops as soon as a page
        contributes nothing new, so a page that reports a count but holds no
        parsable results cannot cause an endless loop.  The returned list may
        therefore be shorter than `number`.
        """
        wanted = index + number
        while len(self.results) < wanted and self.numResults != 0:
            loaded = len(self.results)
            self.search_next()
            if len(self.results) == loaded:
                break
        return self.results[index:wanted]

    def next(self):
        """Return the next result for iteration, fetching it if necessary.

        Raises:
            StopIteration: when no further result is available.  Errors
                raised while fetching are propagated unchanged.
        """
        self.iterate += 1
        found = self.get_results(self.iterate - 1, 1)
        if not found:
            raise StopIteration()
        return found[0]

    # Python 3 spelling of the iterator protocol; unused on Python 2.
    __next__ = next

    def __iter__(self):
        """Restart iteration at the first result and return self."""
        self.iterate = 0
        return self

    def __repr__(self):
        """Return the fixed string "<type 'goopy.websearch'>"."""
        return "<type 'goopy.websearch'>"

    def __len__(self):
        """Return the number of results fetched so far."""
        return len(self.results)

    def __getitem__(self, key):
        """Return the item or slice requested, fetching pages as needed.

        A non-negative index, or a slice with a positive `stop`, triggers
        fetching until enough results are loaded.  Negative indices and
        slices without a positive `stop` are applied to the results that
        are already loaded.
        """
        if isinstance(key, slice):
            needed = key.stop
        else:
            needed = key + 1
        if needed is not None and needed > 0:
            self.get_results(0, needed)
        return self.results[key]

    def __str__(self):
        """Return all loaded results as blank-line separated text entries."""
        return "\n\n".join(["%s (%s)\n\t%s" % (x["title"], x["url"], self.format_width(x["desc"]))
                            for x in self.results])

