Python 爬手机号:Python 手机号前 7 位归属地爬虫代码实例
需求分析
项目上需要用到手机号前 7 位来判断号码是否合法,并查询归属地。旧的数据是几年前的,已经太旧了,打算用 Python 爬虫重新爬一份。
单线程版本
# coding:utf-8
import requests
from datetime import datetime
class PhoneInfoSpider:
    """Crawl phone-number lookup results for a list of 3-digit prefixes.

    For every prefix a series of 11-digit numbers is generated
    (prefix + 4-digit counter + "0000"), each number is sent to the lookup
    endpoint, and four fields of every usable response are appended to an
    output file as one comma-separated line.
    """

    # NOTE(review): the published listing only preserved the path
    # '/cc/json/mobile_tel_gment.htm?tel='. The scheme/host and the
    # 'segment' spelling are reconstructed here -- confirm before relying
    # on this endpoint.
    LOOKUP_URL = 'https://tcc.taobao.com/cc/json/mobile_tel_segment.htm?tel='

    # Seconds to wait for one HTTP response; without a timeout a single
    # stalled connection would block the whole crawl.
    REQUEST_TIMEOUT = 10

    def __init__(self, phoneSections, outputPath='./phone_info.txt'):
        """Store the prefixes to crawl and the file results are appended to.

        Args:
            phoneSections: iterable of 3-digit prefix strings, e.g. ``'138'``.
            outputPath: file that receives one CSV line per result.
                NOTE(review): the listing opened ``'./'``, which is a
                directory and can never be opened for appending; the default
                file name here is a placeholder -- confirm the intended name.
        """
        self.phoneSections = phoneSections
        self.outputPath = outputPath

    def phoneInfoHandler(self, textData):
        """Extract four quoted values from a response and append them to the output file.

        The response is split into lines; responses with fewer than nine
        lines are ignored. The value between the first pair of single quotes
        on lines 1, 2, 3 and 5 (0-based) is taken as number, province,
        mobile_area and postcode respectively.

        Args:
            textData: raw response body as text.

        Raises:
            IndexError: if one of the four lines contains no single-quoted
                value.
        """
        text = textData.splitlines(True)
        if len(text) < 9:
            return
        number, province, mobile_area, postcode = (
            text[index].split("'")[1] for index in (1, 2, 3, 5)
        )
        line_text = ",".join((number, province, mobile_area, postcode))
        print(line_text)
        try:
            # The context manager closes the file after every write; an
            # explicit encoding keeps non-ASCII values portable.
            with open(self.outputPath, 'a', encoding='utf-8') as f:
                f.write(line_text + '\n')
        except (OSError, UnicodeError) as e:
            # Best effort: a failed write is reported, the crawl goes on.
            print(Exception, ":", e)

    def requestPhoneInfo(self, phoneNum):
        """Fetch the lookup result for one number and hand it to the handler.

        Args:
            phoneNum: the 11-digit number to look up, as a string.
        """
        try:
            url = self.LOOKUP_URL + phoneNum
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            self.phoneInfoHandler(response.text)
        except Exception as e:
            # Crawl-loop boundary: one failed or malformed response must not
            # abort the remaining numbers, so the error is only reported.
            print(Exception, ":", e)

    def requestAllSections(self, last=0, count=10):
        """Request every generated number for every configured prefix.

        Args:
            last: counter value to resume from for the first prefix, used to
                continue after an abnormal exit. Later prefixes always start
                at 0.
            count: exclusive upper bound of the 4-digit counter. The default
                of 10 matches the listing; a full sweep of a prefix needs
                10000.
        """
        start = last
        for head in self.phoneSections:
            head_begin = datetime.now()
            print(head + " begin time:" + str(head_begin))
            for i in range(start, count):
                # Build the number automatically; the last four digits are
                # padded with zeros.
                middle = str(i).zfill(4)
                phoneNum = head + middle + "0000"
                self.requestPhoneInfo(phoneNum)
            # The resume offset only applies to the first prefix.
            start = 0
            head_end = datetime.now()
            print(head + " end time:" + str(head_end))
if __name__ == '__main__':
    # Script entry point: build the list of prefixes, crawl them one after
    # another and print the wall-clock start and end times.
    task_begin = datetime.now()
    print("phone check begin time:" + str(task_begin))
    # China Telecom, China Unicom, China Mobile, virtual operators
    dx = ['133', '149', '153', '173', '177', '180', '181', '189', '199']
    # '166' appeared twice in the listing; the duplicate is dropped so the
    # prefix is not crawled (and written to the output) a second time.
    lt = ['130', '131', '132', '145', '146', '155', '156', '166', '171', '175', '176', '185', '186']
    yd = ['134', '135', '136', '137', '138', '139', '147', '148', '150', '151', '152', '157', '158', '159', '172', '178', '182', '183', '184', '187', '188', '198']
    add = ['170']
    all_num = dx + lt + yd + add
    print(len(all_num))
    # Prefixes to crawl
    spider = PhoneInfoSpider(all_num)
    # NOTE(review): the listing created the spider but showed no call that
    # starts it; running the crawl here is the presumed intent -- confirm.
    spider.requestAllSections()
    task_end = datetime.now()
    print("phone check end time:" + str(task_end))
多线程版本
# coding:utf-8
import requests
from datetime import datetime
import queue
import threading
# Number of worker threads started for each phone-number prefix.
threadNum = 32
class MyThread(threading.Thread):
    """Thread that runs a zero-argument callable supplied at construction."""

    def __init__(self, func):
        """Remember *func*; it is invoked once when the thread is started.

        Args:
            func: callable taking no arguments.
        """
        super().__init__()
        self.func = func

    def run(self):
        """Call the stored callable in the new thread."""
        self.func()
def requestPhoneInfo():
    """Worker loop: take counters from the shared queue until it is empty.

    Reads the module-level globals ``q`` (queue of integer counters) and
    ``phone_head`` (the 3-digit prefix currently being crawled). For every
    counter taken from the queue it builds the number
    ``phone_head + 4-digit counter + "0000"``, requests the lookup result and
    passes the response text to ``phoneInfoHandler``.

    ``queue.Queue`` does its own locking, so the counter is taken with a
    non-blocking get instead of an external lock around a size check; the
    listing acquired the lock on every iteration and showed no release.
    """
    while True:
        try:
            p = q.get_nowait()  # fetch the next task
        except queue.Empty:
            break
        # Progress output; the size is approximate because other workers
        # consume from the queue concurrently.
        print("queue size:" + str(q.qsize()))
        # Derive the number from the dequeued task itself. Computing it from
        # the remaining queue size is racy once several workers are running.
        middle = str(p).zfill(4)
        phoneNum = phone_head + middle + "0000"
        print("phoneNum:" + phoneNum)
        try:
            # NOTE(review): the published listing only preserved the path
            # '/cc/json/mobile_tel_gment.htm?tel='. The scheme/host and the
            # 'segment' spelling are reconstructed -- confirm the endpoint.
            url = 'https://tcc.taobao.com/cc/json/mobile_tel_segment.htm?tel=' + phoneNum
            # The timeout keeps one stalled connection from blocking this
            # worker forever.
            response = requests.get(url, timeout=10)
            phoneInfoHandler(response.text)
        except Exception as e:
            # Worker boundary: one failed or malformed response must not end
            # the worker, so the error is only reported.
            print(Exception, ":", e)
def phoneInfoHandler(textData, outputPath='./phone_info.txt'):
    """Extract four quoted values from a response and append them to a file.

    The response is split into lines; responses with fewer than nine lines
    are ignored. The value between the first pair of single quotes on lines
    1, 2, 3 and 5 (0-based) is taken as number, province, mobile_area and
    postcode respectively, printed, and appended to *outputPath* as one
    comma-separated line.

    Args:
        textData: raw response body as text.
        outputPath: file that receives the line.
            NOTE(review): the listing opened ``'./'``, which is a directory
            and can never be opened for appending; the default file name
            here is a placeholder -- confirm the intended name.

    Raises:
        IndexError: if one of the four lines contains no single-quoted value.
    """
    text = textData.splitlines(True)
    if len(text) < 9:
        return
    number, province, mobile_area, postcode = (
        text[index].split("'")[1] for index in (1, 2, 3, 5)
    )
    line_text = ",".join((number, province, mobile_area, postcode))
    print(line_text)
    try:
        # The context manager closes the file after every write; an explicit
        # encoding keeps non-ASCII values portable.
        with open(outputPath, 'a', encoding='utf-8') as f:
            f.write(line_text + '\n')
    except (OSError, UnicodeError) as e:
        # Best effort: a failed write is reported, the crawl goes on.
        print(Exception, ":", e)
if __name__ == '__main__':
    # Script entry point: for every prefix, fill a queue with the counters
    # 0..9999, let `threadNum` workers drain it, and print timing
    # information per prefix and for the whole run.
    task_begin = datetime.now()
    print("phone check begin time:" + str(task_begin))
    dx = ['133', '149', '153', '173', '177', '180', '181', '189', '199']
    # '166' appeared twice in the listing; the duplicate is dropped so the
    # prefix is not crawled (and written to the output) a second time.
    lt = ['130', '131', '132', '145', '155', '156', '166', '171', '175', '176', '185', '186']
    yd = ['134', '135', '136', '137', '138', '139', '147', '150', '151', '152', '157', '158', '159', '172', '178',
          '182', '183', '184', '187', '188', '198']
    all_num = dx + lt + yd
    print(len(all_num))
    for head in all_num:
        head_begin = datetime.now()
        print(head + " begin time:" + str(head_begin))
        q = queue.Queue()
        threads = []
        # Kept as a module-level global because the worker function declares
        # `global lock`.
        lock = threading.Lock()
        # NOTE(review): the listing's loop over range(10000) had no visible
        # body; filling the queue with the counters is the presumed intent,
        # since the workers read their tasks from `q` -- confirm.
        for p in range(10000):
            q.put(p)
        print(q.qsize())
        # Workers read the current prefix from this module-level name. It is
        # set once per prefix, before any worker for that prefix starts.
        phone_head = head
        for i in range(threadNum):
            thread = MyThread(requestPhoneInfo)
            thread.start()
            threads.append(thread)
        # Wait for all workers so the next prefix does not overwrite
        # `phone_head` and `q` while they are still in use.
        for thread in threads:
            thread.join()
        head_end = datetime.now()
        print(head + " end time:" + str(head_end))
    task_end = datetime.now()
    print("phone check end time:" + str(task_end))
多线程版的 1 个号码段有 10000 条数据,大概 2~3 分钟就能爬完,期间 CPU 使用率飙升,大概维持在 70% 左右。总共 40 多个号段,全部爬完大概需要 1~2 个小时,总数据量在 41 万条左右。
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持 Python 博客。