Visually crawling proxy servers with Python's Twisted and Django frameworks

Migrate the week-8 crawler into a Twisted client: the client packages up its crawl results and sends them to the server, which parses them and stores them in the proxy table. The client also gets scheduled polling, e.g. asking the server for crawl-URL tasks every 5 minutes.
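In Twisted, this kind of periodic polling is usually written with twisted.internet.task.LoopingCall rather than an OS-level scheduled job. A minimal sketch, where the request_task callback and the 300-second interval are illustrative and not part of the project code below (the client below instead retries 5 seconds after each "no task" reply):

#coding=utf-8
from twisted.internet import reactor, task

def request_task():
    # placeholder: in the real client this would be transport.write("0")
    print "asking the server for a crawl task..."

loop = task.LoopingCall(request_task)
loop.start(300)  # call request_task every 5 minutes, starting immediately
reactor.run()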

The idea:

Server side:
use a custom switch to control tasks (the status field on each row acts as the switch)
use a Twisted server to listen for clients
use Django for visual management (see the admin.py sketch after models.py below)
use basic Python modules (urllib2, re) to do the actual crawling
models.py:

from django.db import models

# Create your models here.

class Task(models.Model):
    url = models.CharField("url", max_length=100)
    cre = models.CharField("crawler re", max_length=300)
    status = models.IntegerField("status", default=0)

    def __unicode__(self):
        return "%s<-->%d" % (self.url, self.status)

class Proxy(models.Model):
    ip = models.CharField("ip address", max_length=15)
    port = models.CharField("port", max_length=5)
    proxytype = models.CharField("proxy type", max_length=10)
    area = models.CharField("country/area", max_length=50)
    status = models.IntegerField("status", default=0)  # 0=not tested, 1=test ok, 2=test bad
    response = models.FloatField("response time", null=True)

    def __unicode__(self):
        return "%s<-->%f" % (self.ip, self.response)

class Tclient(models.Model):
    name = models.CharField("name", max_length=20)
    add_time = models.DateTimeField("add_time", auto_now=True)

    def __unicode__(self):
        return "%s" % self.name

tw_server.py:

#coding=utf-8
from twisted.internet import reactor
from twisted.internet.protocol import Factory, Protocol

import sys, os

sys.path.insert(0, r"E:\python\test\twdjcrawler")

from django.conf import settings
os.environ['DJANGO_SETTINGS_MODULE'] = "twdjcrawler.settings"

from django.db.models.loading import get_models
loaded_models = get_models()

from cralwer.models import Task, Proxy

tasks = []

class SendTask(Protocol):
    # fetch pending tasks from the database
    def get_tasks(self):
        tks = Task.objects.filter(status=0)  # tasks not yet handed out
        tasks.extend(tks)
        print tasks

    # handle a message sent by the client
    def dealclientmsg(self, clientmsg):
        global tasks
        msgtype = clientmsg[0]  # the first character is the message type
        if msgtype == '0':  # client requests a task
            if len(tasks) > 0:
                tobj = tasks.pop()  # list.pop() removes and returns the last item
                # send one task; for proxy-crawl tasks the format is: 0@@<regex>@@<url>
                self.transport.write('0@@' + str(tobj.cre) + '@@' + str(tobj.url))
                tobj.status = 1
                tobj.save()
            else:
                self.transport.write('1')  # '1' means no task available right now
                tasks = []  # clear the list of tasks already handed out
                self.get_tasks()  # try to fetch tasks again
        elif msgtype == '1':  # client is sending back results
            allresults = clientmsg[1:]
            proxylist = allresults.split('@')  # proxies from the client are separated by '@'
            print "msgtype = 1"
            for pl in proxylist:
                # build a Proxy model object
                pobj = Proxy()
                p = pl.split('|')  # ip, port, proxy type, area, status, response time separated by '|'
                print p
                pobj.ip = p[0]
                pobj.port = p[1]
                pobj.proxytype = p[2]
                pobj.area = p[3].decode('gb2312')  # the area field comes back GB2312-encoded
                pobj.status = int(p[4])
                if pobj.status == 1:
                    pobj.response = float(p[5])
                else:
                    pobj.response = -1
                pobj.save(force_insert=True)
            self.transport.write('2')  # tell the client the results are stored

    def connectionMade(self):
        #tobj = tasks.pop()
        #self.transport.write(tobj.url)
        #tobj.status = 1
        #tobj.save()
        self.get_tasks()

    def dataReceived(self, data):
        self.dealclientmsg(data)

factory = Factory()
factory.protocol = SendTask

#reactor.callLater(1, get_tasks)
reactor.listenTCP(8001, factory)
reactor.run()
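Pulled out of the two scripts, the ad-hoc wire protocol between client and server looks like this; the regex, URL and proxy values below are made up for illustration:

# The message shapes used on the wire, with made-up example values:
examples = [
    "0",                                      # client -> server: request a task
    "0@@<regex>@@http://example.com/list",    # server -> client: a task (regex + url)
    "1",                                      # server -> client: no task available
    "1" + "@".join([                          # client -> server: results, '@'-separated;
        "1.2.3.4|8080|http|SomeArea|1|0.42",  #   fields of one proxy are '|'-separated
        "5.6.7.8|3128|http|OtherArea|2",      #   failed proxies have status 2 and no time
    ]),
    "2",                                      # server -> client: results stored, ask again
]
for m in examples:
    print m

Note that TCP is a byte stream and dataReceived may deliver partial or concatenated writes, so the one-write-per-message assumption really only holds for small messages on a local connection.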

tw_client.py:

#coding=utf-8
from twisted.internet.protocol import Protocol, ClientFactory
from twisted.internet import reactor
import urllib2, re, time

# letter-to-digit map for the target site's obfuscated port encoding
portdict = {'v':'3', 'm':'4', 'a':'2', 'l':'9', 'q':'0', 'b':'5', 'i':'7', 'w':'6', 'r':'8', 'c':'1'}

class Crawler():
    def __init__(self, urladdr, pattern):
        self.urladdr = urladdr
        self.pattern = pattern
        self.proxylist = []

    def run(self):
        global portdict
        print 'start crawl %s' % self.urladdr
        req = urllib2.Request(url=self.urladdr)
        reqresp = urllib2.urlopen(req)
        content = reqresp.read()
        self.match = re.compile(self.pattern, re.I).findall(content)
        for row in self.match:
            try:
                print row
                ip = row[0]
                port = row[1]
                port = map(lambda x: portdict[x], port.split('+'))  # decode the obfuscated port
                port = ''.join(port)
                proxytype = row[2]
                area = row[3]
                self.proxylist.append([ip, port, proxytype, area])
            except Exception:
                print 'error parsing row'

class Validation():
    def __init__(self, proxy):
        self.website = "http://www.baidu.com/"  # page used for validation
        self.targetstr = "030173"  # string to look for on the validation page
        self.proxy = proxy  # a list: [ip, port, proxy type, area]

    def run(self):
        timeout = 2  # timeout in seconds
        cookies = urllib2.HTTPCookieProcessor()
        proxhandler = urllib2.ProxyHandler({"%s" % self.proxy[2]: "%s:%s" % (self.proxy[0], self.proxy[1])})
        opener = urllib2.build_opener(cookies, proxhandler)
        urllib2.install_opener(opener)
        print 'start validate %s' % (self.proxy[0])
        timestart = time.time()
        try:
            req = urllib2.Request(url=self.website)
            reqresp = urllib2.urlopen(req, timeout=timeout)
            content = reqresp.read()
            timeend = time.time()
            pos = content.find(self.targetstr)
            if pos != -1:  # find() returns -1 when the string is absent
                self.proxy.append('1')  # status: validation passed
                self.proxy.append(str(round(timeend - timestart, 2)))
                #print self.proxy
            else:
                self.proxy.append('2')  # status: validation failed
        except Exception, e:
            self.proxy.append('2')  # treat timeouts/errors as failed, so the server still gets a status field

class Dealservermsg(Protocol):
    def connectionMade(self):
        self.transport.write("0")  # request a task

    def dataReceived(self, servermsg):
        task = servermsg.split('@@')  # flag, regex and url from the server are separated by '@@'
        flag = task[0]  # the has-task flag
        print 'flag=%s' % flag
        if flag == '0':  # there is a task
            # crawl
            pattern = task[1]
            urladdr = task[2]
            C_inst = Crawler(urladdr, pattern)
            C_inst.run()
            # validate
            sendtoserverlist = []
            for p in C_inst.proxylist:
                V_inst = Validation(p)
                V_inst.run()
                sendtoserverlist.append('|'.join(V_inst.proxy))  # ip, port, type, area, status, response time joined by '|'
            # send the validated results back to the server
            self.transport.write('1' + '@'.join(sendtoserverlist))
        elif flag == '1':  # no task
            print 'no task currently'
            reactor.callLater(5, self.transport.write, "0")  # retry in 5 seconds without blocking the reactor
        elif flag == '2':  # results stored, ask for the next task
            self.transport.write("0")

class EchoClientFactory(ClientFactory):
    def startedConnecting(self, connector):
        print "connection starting..."

    def buildProtocol(self, addr):
        print addr
        return Dealservermsg()

    def clientConnectionLost(self, connector, reason):
        print "lose reason:", reason

    def clientConnectionFailed(self, connector, reason):
        print "failed reason:", reason

reactor.connectTCP('127.0.0.1', 8001, EchoClientFactory())
reactor.run()
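To sanity-check a single proxy without the Twisted plumbing, the Validation class can be exercised on its own, e.g. pasted into an interactive shell (importing tw_client.py directly would start the reactor); the proxy below is made up:

p = ['1.2.3.4', '8080', 'http', 'SomeArea']  # made-up proxy: [ip, port, type, area]
Validation(p).run()
print p  # now ends with a status flag ('1' or '2'), plus the response time if it passed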

Screenshot of the client running:

As you can see, there is no task at first; after one task is added through the Django admin page, the client starts receiving it from the server. Then the queue runs dry again, two more tasks are added through the admin page, and the client goes back to crawling and validating proxy addresses normally.

Screenshot of the results stored in the database:

Reference (a senior classmate's write-up): http://f.dataguru.cn/forum.php?mod=viewthread&tid=66360&fromuid=22344

Summary:

1. Twisted can sit and listen for tasks, which works better than firing the crawler from a scheduled job.

2. Django's built-in models (with the admin site) provide the visual management.

3. Importing Django modules from a standalone script is different from importing other modules: the environment has to be set up first.

import sys,os

sys.path.insert(0,r"E:\python\test\twdjcrawler")

from django.conf import settings
os.environ['DJANGO_SETTINGS_MODULE'] = "twdjcrawler.settings"

from django.db.models.loading import get_models
loaded_models = get_models()

from cralwer.models import Task,Proxy
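For what it's worth, on newer Django versions (1.7+) django.db.models.loading is gone and the same bootstrap collapses into a single django.setup() call; a sketch using the same paths as above:

import sys, os
sys.path.insert(0, r"E:\python\test\twdjcrawler")
os.environ['DJANGO_SETTINGS_MODULE'] = "twdjcrawler.settings"

import django
django.setup()  # populates the app registry, replacing the get_models() dance

from cralwer.models import Task, Proxy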

Troubleshooting:

Encoding problem: re-saving the .py file as UTF-8 in a text editor solved it; this may be a bug in WingIDE itself.

Source code:

http://pan.baidu.com/share/link?shareid=1300995637&uk=3909359194
