Thursday, December 5, 2013

A Python-Flask Project: Crawling URLs from Different Search Engines


The Python code for the request file (request_file.py) is as follows:


from celery import Celery
from flask import Flask, render_template, request
from flask.ext.paginate import Pagination
from flask.ext.mongoengine import MongoEngine
from mongoengine import connect
import datetime, random, json


# Flask application whose MongoEngine connection uses the "gsearch" database.
app = Flask(__name__)
app.config.update(MONGODB_SETTINGS={'DB': 'gsearch'})
db = MongoEngine(app)

# Celery application using an AMQP broker URL and a MongoDB result backend URL.
# It is not referenced by any of the code shown in this listing.
abc = Celery(
    'project',
    broker='amqp://guest@localhost//',
    backend='mongodb://localhost',
)

class r42kennel(db.Document):
    """Product that the crawler should look up on the search engines.

    The /search view picks a document whose ``to_be_searched`` flag is set,
    builds the query text from ``name``, and clears the flag (matched by
    ``slug``) once results have been posted back.
    """

    # Text that is inserted into the search query.
    name = db.StringField(max_length=255)
    # Key used to tie stored result URLs back to this product.
    slug = db.StringField(max_length=255)
    # True until results for this product have been stored.
    to_be_searched = db.BooleanField(default=True)


class EcommercePortal(db.Document):
    """Shopping portal whose URLs are kept when search results are filtered."""

    # Site identifier that result URLs are compared against.
    netloc = db.StringField(max_length=255, required=True)
    # Only portals with this flag set are used for filtering and listing.
    is_active = db.BooleanField(default=True)

    def __unicode__(self):
        """Use the portal's netloc as its text representation."""
        return self.netloc


class searched_result(db.Document):
    """One result URL stored for a product slug."""

    # Slug of the product the URL was found for.
    slug = db.StringField()
    # The result URL itself.
    url = db.StringField()
    # Set to True by the /search view once the results have been recorded.
    status = db.BooleanField(default=False)
    # Pass the callable, not its result: ``datetime.datetime.now()`` would be
    # evaluated once at import time and every document would then share that
    # single timestamp as its default.
    searched_on = db.DateTimeField(default=datetime.datetime.now)

class select_search_engine(db.Document):
    """Search engine entry with an on/off flag.

    Not referenced by the views shown in this listing.
    """

    # Name of the search engine.
    search_engine = db.StringField()
    # Flag stored with the entry; defaults to True.
    is_alive = db.BooleanField(default=True)



@app.route('/')
def main():
    """Serve the landing page rendered from ``home.html``."""
    home_page = render_template('home.html')
    return home_page


@app.route('/search', methods=['GET','POST'])
def search():
    """Hand out the next product to search (GET) or store its results (POST).

    GET returns a JSON string ``{"query_string": ..., "slug": ...}`` for one
    product whose ``to_be_searched`` flag is set, or a JSON error with
    status 404 when no product is waiting.

    POST expects a JSON object body mapping one slug to a list of URLs.
    Every URL is saved as a ``searched_result``, all results for that slug
    are marked as done with the current time, the product's
    ``to_be_searched`` flag is cleared, and ``search_urls.html`` is rendered
    with the posted URLs. A body that is not a non-empty JSON object yields
    a JSON error with status 400.
    """
    if request.method == 'GET':
        # first() returns None for an empty queryset, where indexing with
        # [0] would raise IndexError and surface as a 500.
        r42product = r42kennel.objects.filter(to_be_searched=True).first()
        if r42product is None:
            return json.dumps({'error': 'no product is waiting to be searched'}), 404
        return json.dumps({
            'query_string': 'buy %s in india' % r42product.name,
            'slug': r42product.slug,
        })

    try:
        data = json.loads(request.data)
    except ValueError:
        return json.dumps({'error': 'request body is not valid JSON'}), 400
    if not isinstance(data, dict) or not data:
        return json.dumps({'error': 'expected a JSON object of the form {slug: [urls]}'}), 400

    # Only the first slug/URL-list pair of the posted object is used.
    slug, result = next(iter(data.items()))
    for url in result:
        searched_result(url=url, slug=slug).save()

    # Mark every stored result for this slug (not only the new ones) as done.
    searched_result.objects.filter(slug=slug).update(
        set__status=True,
        set__searched_on=datetime.datetime.now()
        )
    # Take the product out of the queue handed out by GET.
    r42kennel.objects.filter(slug=slug).update(set__to_be_searched=False)

    return render_template("search_urls.html", required_urls=result)

@app.route('/search_url')
def search_url():
    """Show the stored result URLs, ten per page.

    The page number is read from the ``page`` query parameter (default 1;
    non-numeric or non-positive values fall back to 1). The template
    receives the pagination helper and only the results of that page.
    """
    per_page = 10
    try:
        page = int(request.args.get('page', 1))
    except ValueError:
        page = 1
    page = max(page, 1)

    results = searched_result.objects.filter(status=True)
    # Pagination needs the total count to build its links; count before
    # slicing so the total covers all matching documents.
    total = results.count()
    first = (page - 1) * per_page
    search = results[first:first + per_page]

    # The ``search`` keyword of Pagination is a boolean "search mode" flag,
    # not the result set, so it is no longer passed here.
    pagination = Pagination(page=page, per_page=per_page, total=total)
    return render_template('showresult_DB.html', pagination=pagination, search=search)

@app.route('/remove', methods=['GET','POST'])
def remove():
    """Render ``edit_portal.html`` with the active portals.

    Despite its name, this view only lists portals; it does not delete or
    modify anything itself.
    """
    active_portals = EcommercePortal.objects.filter(is_active=True)
    return render_template('edit_portal.html', ep=active_portals)



if __name__ == '__main__':
    # NOTE(review): debug=True enables the interactive debugger, and
    # host="0.0.0.0" listens on every network interface. Together they let
    # anyone who can reach the port execute code, so use this for local
    # development only.
    app.run(debug=True, host="0.0.0.0")


Search.py:

from lxml import etree
import requests, urllib, json
import mongoengine, random
from request_file import EcommercePortal
from twisted.internet.task import LoopingCall
from twisted.internet import reactor


# Connect MongoEngine to the "gsearch" database at import time; extract()
# queries EcommercePortal documents.
mongoengine.connect('gsearch')


def GoogleSearch(data):
    """Collect Google result links for a query and filter them with extract().

    Five result pages are requested (``start`` = 0, 10, ..., 40).

    Args:
        data: dict with 'query_string' (the text to search for) and 'slug'.

    Returns:
        Whatever extract() returns for the collected links.
    """
    # Python 2 standard library, like the rest of this script.
    from urlparse import parse_qs, urlparse

    print("GoogleSearch")
    search_url_list = []
    for page in range(5):
        query_string = urllib.urlencode({
            'q': data['query_string'],
            'start': page * 10,
        })
        response = requests.get("http://www.google.com/search?" + query_string)
        tree = etree.HTML(response.text)

        for href in tree.xpath("//li[@class='g']//h3//a/@href"):
            # Result links are redirects of the form
            # "/url?q=<target>&sa=...". Read the real target from the "q"
            # parameter instead of only stripping the prefix, which left
            # Google's tracking parameters and percent-encoding in the URL.
            parsed = urlparse(href)
            if parsed.path == '/url':
                target = parse_qs(parsed.query).get('q')
                if target:
                    href = target[0]
            search_url_list.append(href)

    data_dict = {
        'urls': search_url_list,
        'slug': data['slug'],
    }
    return extract(data_dict)





def extract(data_dict):
    """Keep only the URLs that belong to an active e-commerce portal.

    For each URL, the part before the first ".com/" (or, when there is no
    ".com/", before the first ".in/") is taken, scheme included, "www." is
    removed and the suffix is put back; the URL is kept when that string is
    one of the active portals' ``netloc`` values. Kept URLs are cut at the
    first "&", except those containing "ebay.in/", which stay untouched.

    Args:
        data_dict: dict whose 'urls' entry is the list of URLs to filter.

    Returns:
        A set of the filtered URLs.
    """
    portals = list(
        EcommercePortal.objects.filter(is_active=True).values_list('netloc')
    )

    matched = []
    for url in data_dict['urls']:
        for suffix in (".com", ".in"):
            marker = suffix + "/"
            if marker not in url:
                continue
            site = url.split(marker, 1)[0].replace('www.', '') + suffix
            if site in portals:
                matched.append(url)
            # Only the first suffix found in the URL is ever tried.
            break

    return set(
        url if 'ebay.in/' in url else url.split('&', 1)[0]
        for url in matched
    )



def YahooSearch(data):
    """Collect Yahoo India result links for a query and filter them.

    Five result pages are requested (``b`` = 1, 11, ..., 41).

    Args:
        data: dict with 'query_string' (the text to search for) and 'slug'.

    Returns:
        Whatever extract() returns for the collected links.
    """
    print("YahooSearch")
    links = []
    for page in range(5):
        params = {
            'p': data['query_string'],
            'b': page * 10 + 1,
        }
        response = requests.get(
            "http://in.search.yahoo.com/search;_ylt=?" + urllib.urlencode(params)
        )
        document = etree.HTML(response.text)
        links.extend(
            document.xpath("//div[@id='web']//div[@class='res']//h3//a/@href")
        )

    return extract({'urls': links, 'slug': data['slug']})

def BingSearch(data):
    """Collect Bing result links for a query and filter them.

    Five result pages are requested (``first`` = 1, 11, ..., 41).

    Args:
        data: dict with 'query_string' (the text to search for) and 'slug'.

    Returns:
        Whatever extract() returns for the collected links.
    """
    print("BingSearch")
    links = []
    for page in range(5):
        params = {
            'q': data['query_string'],
            'first': page * 10 + 1,
        }
        response = requests.get(
            "http://www.bing.com/search?" + urllib.urlencode(params)
        )
        document = etree.HTML(response.text)
        links.extend(document.xpath("//div[@class='sb_tlst']//h3/a/@href"))

    return extract({'urls': links, 'slug': data['slug']})

def AskSearch(data):
    """Collect Ask.com result links for a query and filter them.

    Five result pages are requested (``page`` = 0 to 4).

    Args:
        data: dict with 'query_string' (the text to search for) and 'slug'.

    Returns:
        Whatever extract() returns for the collected links.
    """
    print("AskSearch")
    links = []
    for page in range(5):
        params = {'page': page, 'q': data['query_string']}
        response = requests.get(
            "http://www.ask.com/web?" + urllib.urlencode(params)
        )
        document = etree.HTML(response.text)
        links.extend(document.xpath("//div[@class='wresult tsrc_tled']//a/@href"))

    return extract({'urls': links, 'slug': data['slug']})


def start_function():
    """Run one crawl cycle against the Flask app.

    Fetches the next query from GET /search, runs it through one randomly
    chosen search-engine function, and posts ``{slug: [urls]}`` as JSON back
    to POST /search.
    """
    # 0.0.0.0 is a listen address, not a portable destination; the server
    # bound to all interfaces is reachable on the loopback address.
    search_endpoint = "http://127.0.0.1:5000/search"

    data_dict = requests.get(search_endpoint).json()

    # Choose the function object directly instead of eval()-ing a string.
    engine = random.choice([GoogleSearch, YahooSearch, BingSearch, AskSearch])
    found_urls = list(engine(data_dict))  # sets are not JSON serialisable

    payload = json.dumps({data_dict['slug']: found_urls})
    print(payload)
    requests.post(search_endpoint, data=payload)

if __name__ == '__main__':
    # Call start_function immediately and then every 30 seconds while the
    # Twisted reactor is running.
    crawl_loop = LoopingCall(start_function)
    crawl_loop.start(30)
    reactor.run()

Tuesday, December 3, 2013

Asynchronous Programming and Twisted

What is Twisted?

Twisted is an event-driven networking framework. It is written in Python and is FOSS (Free and Open Source Software) released under the MIT Licence.

An Introduction to Asynchronous Programming and Twisted

Synchronous programming

The simplest style of programming. Each task is performed one at a time, with one finishing completely before another is started. And if the tasks are always performed in a definite order, the implementation of a later task can assume that all earlier tasks have finished without errors, with all their output available for use — a definite simplification in logic.

In the threaded model, by contrast, each task is performed in a separate thread of control. The threads are managed by the operating system and may, on a system with multiple processors or multiple cores, run truly concurrently, or may be interleaved together on a single processor. The point is, in the threaded model the details of execution are handled by the OS and the programmer simply thinks in terms of independent instruction streams which may run simultaneously. Although the model is simple to describe, in practice threaded programs can be quite complex because of the need for threads to coordinate with one another. Thread communication and coordination is an advanced programming topic and can be difficult to get right.

Now let's write some code to understand this better.


from twisted.internet import reactor, protocol, endpoints
class UpperProtocol(protocol.Protocol):
    """Greets each client, upper-cases the data it sends, then disconnects."""

    def connectionMade(self):
        """Send a greeting as soon as a client connects."""
        self.transport.write("Hi its my first twisted code")

    # connectionLost and dataReceived must be methods of the class. Nested
    # inside connectionMade they were only local functions, so received data
    # was never upper-cased or answered.
    def connectionLost(self, reason):
        """Nothing to clean up when the connection goes away."""
        pass

    def dataReceived(self, data):
        """Reply with the received data in upper case and close the connection."""
        self.transport.write(data.upper())
        self.transport.loseConnection()
# Factory that builds one UpperProtocol instance per incoming connection.
factory = protocol.ServerFactory()
factory.protocol = UpperProtocol

# Listen on TCP port 8080 exactly once. The extra
# reactor.listenTCP(8080, factory) call tried to bind the same port a second
# time, which cannot succeed while the endpoint is already listening.
endpoints.serverFromString(reactor, "tcp:8080").listen(factory)

reactor.run()

This is the server-side code using Twisted; the server listens on port 8080 of localhost.

The client side, using Twisted, can be written as follows:

from twisted.internet import reactor, protocol, endpoints
class UppercaseClientProtocol(protocol.Protocol):
    """Sends the factory's text once connected and prints every reply."""

    def connectionMade(self):
        """Write the factory's text followed by the terminator string."""
        transport = self.transport
        for chunk in (self.factory.text, "\r \n"):
            transport.write(chunk)

    def dataReceived(self, data):
        """Print whatever the server sends back."""
        print(data)
if __name__ == '__main__':
    import sys

    # Usage: <script> host:port text [text ...]
    # Validate explicitly: a missing argument used to raise IndexError, and
    # assert statements are stripped when Python runs with -O.
    if len(sys.argv) < 2 or ':' not in sys.argv[1]:
        sys.exit("need host:port for argument 1")
    data_to_send = sys.argv[2:]
    endpoint = endpoints.clientFromString(reactor, "tcp:" + sys.argv[1])

    # One connection, with its own factory, per text given on the command line.
    for data in data_to_send:
        print("sending %s" % data)
        factory = protocol.ClientFactory()
        factory.protocol = UppercaseClientProtocol
        factory.text = data
        endpoint.connect(factory)

    # Keep the reactor inside the guard so that importing this module does
    # not start (and block on) the event loop.
    reactor.run()

Monday, November 25, 2013

First Python Script


Today I am writing about Python basics. Python is a high-level scripting language. Python programs are run by the Python interpreter, which you can start from the Linux terminal. Python is an object-oriented programming language; it uses very little punctuation in its syntax but enforces indentation very strictly.

Writing a basic Python program:

  • 1. Open a text file and write this code into it.
  • #!/usr/bin/python
    def main():
        # The function body must be indented under the def line; without the
        # indentation Python stops with an IndentationError.
        print("Hello World!!!!!")
    if __name__=="__main__":
        main()

  • 2. Save this file as Hello_World.py
  • 3. Now go to the directory where you saved the file, using the terminal, and type:
  • $ python Hello_World.py

    Sunday, November 24, 2013