基于selenium的携程机票爬取程序(绕过反爬)
基于selenium的携程机票爬取程序【V2.0】
注意:由于携程网站的更新,下文的方法可能已经失效。如有需要,可以尝试上面两篇文章中基于selenium的方案,或者按照参考文章的流程自行进行js逆向。
转载一篇基于requests和js逆向的携程机票爬取程序——获取携程机票信息(爬虫)
js逆向的效果很好,但是短时间内发送大量请求(requests)会导致IP被封禁(基本上是秒封)
相比之下,基于selenium的携程机票爬取程序(绕过反爬)的执行效率虽然比不上requests,但是能够保证IP不被封禁
import os
import hashlib
import json
import logging
import random
import time
import pandas as pd
import requests
from fake_useragent import UserAgent
from datetime import datetime as dt,timedelta
一、获取原始数据
# References:
# - List of airports in the People's Republic of China - Wikipedia
# https://zh.wikipedia.org/wiki/%E4%B8%AD%E5%8D%8E%E4%BA%BA%E6%B0%91%E5%85%B1%E5%92%8C%E5%9B%BD%E6%9C%BA%E5%9C%BA%E5%88%97%E8%A1%A8
# - Cracking the `sign` parameter of Ctrip international flights https://blog.csdn.net/weixin_38927522/article/details/108214323
# - Front-end anti-anti-scraping techniques https://zhuanlan.zhihu.com/p/250176143
# Shared fake_useragent instance; its `chrome` attribute supplies the
# "user-agent" header of the batch-search request in get_flight_info().
ua = UserAgent()
def get_cookie_bfa():
    """Build a pseudo-random ``_bfa`` cookie string.

    The value is eight dot-separated fields: ``1``, the current time in
    milliseconds, a six-character random id made of lowercase letters and
    digits, ``1``, the same timestamp twice more, and two trailing ``1``s.

    Returns:
        str: for example
        ``_bfa=1.1639722810158.u3jal2.1.1639722810158.1639722810158.1.1``.
    """
    alphabet = "abcdefghijklmnopqrstuvwxyz1234567890"
    # Same six random.choice() draws as before, joined once instead of
    # growing a string with += inside the loop.
    random_id = "".join(random.choice(alphabet) for _ in range(6))
    # A single timestamp is reused for all three time fields.
    timestamp_ms = str(int(round(time.time() * 1000)))
    fields = ["1", timestamp_ms, random_id, "1", timestamp_ms, timestamp_ms, "1", "1"]
    return "_bfa=" + ".".join(fields)
def get_sign(transaction_id: str, departure_city_code: str,
             arrival_city_code: str, departure_date: str) -> str:
    """Compute the ``sign`` header value for the Ctrip flight-search API.

    The signature is the MD5 hex digest of the four arguments concatenated
    in order (no separator) and encoded as UTF-8.

    Args:
        transaction_id: Transaction id returned by the flight-list endpoint.
        departure_city_code: Departure city code.
        arrival_city_code: Arrival city code.
        departure_date: Departure date string, exactly as sent in the query.

    Returns:
        The 32-character lowercase hexadecimal MD5 digest.
    """
    payload = transaction_id + departure_city_code + arrival_city_code + departure_date
    return hashlib.md5(payload.encode('utf-8')).hexdigest()
def get_transaction_id(departure_city_code, arrival_city_code, departure_date, cabin):
    """Fetch the flight-list payload and its transaction id for a one-way route.

    Sends a GET request to the Ctrip ``flightlist`` endpoint and reads the
    ``data`` object of the JSON response, which must contain ``transactionID``.

    Args:
        departure_city_code: Departure city code placed in the URL path.
        arrival_city_code: Arrival city code placed in the URL path.
        departure_date: Value of the ``depdate`` query parameter.
        cabin: Value of the ``cabin`` query parameter.

    Returns:
        tuple: ``(transaction_id, flight_list_data)`` on success. On any
        failure (network error, non-200 status, malformed response) the
        error is logged and ``("", None)`` is returned.
    """
    flight_list_url = (
        "https://flights.ctrip.com/international/search/api/flightlist"
        "/oneway-{}-{}?_=1&depdate={}&cabin={}&containstax=1"
    ).format(departure_city_code, arrival_city_code, departure_date, cabin)

    try:
        # Bounded wait: without a timeout an unresponsive server would block
        # the crawl indefinitely. 30 s is an arbitrary, generous limit.
        flight_list_req = requests.get(url=flight_list_url, timeout=30)
    except requests.RequestException as e:
        # Connection problems are reported through the same failure value
        # as every other error path instead of escaping to the caller.
        logging.error("get transaction id failed, {}".format(e))
        return "", None

    if flight_list_req.status_code != 200:
        logging.error("get transaction id failed, status code {}".format(flight_list_req.status_code))
        return "", None

    try:
        flight_list_data = flight_list_req.json()["data"]
        transaction_id = flight_list_data["transactionID"]
    except (ValueError, KeyError, TypeError) as e:
        # ValueError: body is not JSON; KeyError/TypeError: unexpected shape.
        logging.error("get transaction id failed, {}".format(e))
        return "", None
    return transaction_id, flight_list_data
def get_flight_info(departure_city_code, arrival_city_code, departure_date, cabin):
    """Query the Ctrip batch-search API for the itineraries of a one-way route.

    Steps: obtain the transaction id and flight-list payload, derive the
    ``sign`` header and a ``_bfa`` cookie, then POST the payload as JSON to
    the ``batchSearch`` endpoint.

    Args:
        departure_city_code: Departure city code.
        arrival_city_code: Arrival city code.
        departure_date: Departure date string used in the URL and signature.
        cabin: Cabin filter string passed through to the API.

    Returns:
        tuple: ``(True, itineraries)`` on success, where ``itineraries`` is
        the response's ``data.flightItineraryList`` (an empty list when that
        key is absent). ``(False, None)`` on any failure, which is logged.
    """
    # Transaction id and flight-list payload.
    transaction_id, flight_list_data = get_transaction_id(
        departure_city_code, arrival_city_code, departure_date, cabin)
    if transaction_id == "" or flight_list_data is None:
        return False, None

    try:
        scope = flight_list_data["scope"]
    except (KeyError, TypeError) as e:
        # A payload without "scope" cannot be used to build the headers.
        logging.error("get flight info failed, {}".format(e))
        return False, None

    # "sign" request header required by the search endpoint.
    sign = get_sign(transaction_id, departure_city_code, arrival_city_code, departure_date)
    # "_bfa" value sent as the cookie.
    bfa = get_cookie_bfa()

    # Build and send the search request.
    search_url = "https://flights.ctrip.com/international/search/api/search/batchSearch"
    referer = (
        "https://flights.ctrip.com/online/list/oneway-{}-{}"
        "?_=1&depdate={}&cabin={}&containstax=1"
    ).format(departure_city_code, arrival_city_code, departure_date, cabin)
    search_headers = {
        "transactionid": transaction_id,
        "sign": sign,
        "scope": scope,
        "origin": "https://flights.ctrip.com",
        "referer": referer,
        "content-type": "application/json;charset=UTF-8",
        "user-agent": ua.chrome,
        "cookie": bfa,
    }
    try:
        # Bounded wait so a stalled connection cannot hang the crawl.
        r = requests.post(url=search_url, headers=search_headers,
                          data=json.dumps(flight_list_data), timeout=30)
    except requests.RequestException as e:
        logging.error("get flight info failed, {}".format(e))
        return False, None

    if r.status_code != 200:
        logging.error("get flight info failed, status code {}".format(r.status_code))
        return False, None

    try:
        result_json = r.json()
        # A non-zero context flag is treated as a failed query.
        if result_json["data"]["context"]["flag"] != 0:
            logging.error("get flight info failed, {}".format(result_json))
            return False, None
    except (ValueError, KeyError, TypeError) as e:
        logging.error("get flight info failed, {}".format(e))
        return False, None

    # No itinerary list in the response means no flights were found.
    result_data = result_json["data"].get("flightItineraryList", [])
    return True, result_data
二、数据处理
def check_data(flightItineraryList):
    """Keep only itineraries whose first segment has ``transferCount == 0``.

    The list is filtered in place (callers holding a reference see the
    change) and the same list object is returned.

    Args:
        flightItineraryList: List of itinerary dicts; each must provide
            ``['flightSegments'][0]['transferCount']``.

    Returns:
        list: The same list object, with transfer itineraries removed.
    """
    # Slice assignment keeps the caller's list object while filtering in a
    # single pass instead of repeated O(n) pop() calls.
    flightItineraryList[:] = [
        itinerary for itinerary in flightItineraryList
        if itinerary['flightSegments'][0]['transferCount'] == 0
    ]
    return flightItineraryList
def proc_flightSegments(flightItineraryList):
    """Flatten the first flight of each itinerary into one DataFrame row.

    For every itinerary, the first entry of
    ``['flightSegments'][0]['flightList']`` is copied, a fixed set of
    unneeded keys is dropped, and the combined departure/arrival timestamps
    are split on the space into separate day and time columns.

    Args:
        flightItineraryList: List of itinerary dicts. The input dicts are
            not modified.

    Returns:
        pandas.DataFrame: One row per itinerary; empty for empty input.
    """
    # Keys that are not needed in the output (missing keys are ignored).
    dellist = ['sequenceNo', 'marketAirlineCode',
               'departureProvinceId', 'departureCityId', 'departureCityCode', 'departureAirportShortName', 'departureTerminal',
               'arrivalProvinceId', 'arrivalCityId', 'arrivalCityCode', 'arrivalAirportShortName', 'arrivalTerminal',
               'transferDuration', 'stopList', 'leakedVisaTagSwitch', 'trafficType', 'highLightPlaneNo', 'mealType',
               'operateAirlineCode', 'arrivalDateTime', 'departureDateTime', 'operateFlightNo', 'operateAirlineName']

    rows = []
    for itinerary in flightItineraryList:
        # Work on a copy so the caller's data is left untouched.
        flight = dict(itinerary['flightSegments'][0]['flightList'][0])
        departure_parts = flight['departureDateTime'].split(' ')
        arrival_parts = flight['arrivalDateTime'].split(' ')
        departureday, departuretime = departure_parts[0], departure_parts[1]
        arrivalday, arrivaltime = arrival_parts[0], arrival_parts[1]

        for key in dellist:
            flight.pop(key, None)

        # Replace the combined timestamps with separate day/time fields.
        flight.update({'departureday': departureday, 'departuretime': departuretime,
                       'arrivalday': arrivalday, 'arrivaltime': arrivaltime})
        rows.append(flight)

    # Build the frame once instead of concatenating row by row.
    return pd.DataFrame(rows)
def _cabin_price_summary(fares, discounts):
    """Return ``(origin, low, cut)`` for one cabin class.

    ``origin`` is the fare whose discount rate equals 1; when no such fare
    exists it is estimated as ``int(max(fares) / max(discounts))``.
    ``low`` and ``cut`` are the minimum fare and minimum discount rate, or
    empty strings when nothing is discounted. All three are empty strings
    when ``fares`` is empty.
    """
    if not fares:
        return '', '', ''
    try:
        origin = fares[discounts.index(1)]
    except ValueError:
        # No undiscounted fare is listed; derive one from the best ratio.
        origin = int(max(fares) / max(discounts))
    lowest_discount = min(discounts)
    if lowest_discount != 1:
        return origin, min(fares), lowest_discount
    return origin, '', ''


def proc_priceList(flightItineraryList):
    """Summarise economy and business fares of each itinerary.

    The flight number is the part of ``itineraryId`` before the first
    underscore. Prices with cabin ``'Y'`` are treated as economy and
    ``'C'`` as business; other cabins are ignored.

    Args:
        flightItineraryList: List of itinerary dicts, each with
            ``itineraryId`` and ``priceList``.

    Returns:
        pandas.DataFrame: One row per itinerary with the columns
        ``flightNo``, ``economy_origin``, ``economy_low``, ``economy_cut``,
        ``bussiness_origin``, ``bussiness_low`` and ``bussiness_cut``.
        Unavailable values are empty strings. Empty for empty input.
    """
    rows = []
    for itinerary in flightItineraryList:
        flightNo = itinerary['itineraryId'].split('_')[0]
        # Economy fares and their discount rates.
        economy, economy_discount = [], []
        # Business fares and their discount rates.
        bussiness, bussiness_discount = [], []
        for price in itinerary['priceList']:
            adultPrice = price['adultPrice']
            cabin = price['cabin']
            discountRate = price['priceUnitList'][0]['flightSeatList'][0]['discountRate']
            if cabin == 'Y':
                economy.append(adultPrice)
                economy_discount.append(discountRate)
            elif cabin == 'C':
                bussiness.append(adultPrice)
                bussiness_discount.append(discountRate)

        economy_origin, economy_low, economy_cut = _cabin_price_summary(economy, economy_discount)
        bussiness_origin, bussiness_low, bussiness_cut = _cabin_price_summary(bussiness, bussiness_discount)
        rows.append({'flightNo': flightNo,
                     'economy_origin': economy_origin, 'economy_low': economy_low, 'economy_cut': economy_cut,
                     'bussiness_origin': bussiness_origin, 'bussiness_low': bussiness_low, 'bussiness_cut': bussiness_cut})

    # Build the frame once instead of concatenating row by row.
    return pd.DataFrame(rows)
def mergedata(flights, prices, date, city):
    """Join flight and price tables and write them to a CSV file.

    The two frames are merged on ``flightNo``, stamped with today's date,
    renamed to Chinese column headers, reordered, and written (GB18030
    encoded, without the index) to
    ``<cwd>/<date>/<date>-<city[0]>-<city[1]>.csv``. The ``<date>``
    directory is created if necessary.

    Any exception raised along the way is caught and printed, so this
    function never raises for bad input.

    Args:
        flights: DataFrame as produced by ``proc_flightSegments``.
        prices: DataFrame as produced by ``proc_priceList``.
        date: Departure date string; used as directory and file-name prefix.
        city: Sequence whose first two items are the departure and arrival
            city names used in the file name.
    """
    try:
        df = flights.merge(prices, on=['flightNo'])
        df['数据获取日期'] = dt.now().strftime('%Y-%m-%d')
        # Output headers, in final column order.
        order = ['数据获取日期', '航班号', '航空公司',
                 '出发日期', '出发时间', '到达日期', '到达时间', '飞行时长', '出发国家', '出发城市', '出发机场', '出发机场三字码',
                 '到达国家', '到达城市', '到达机场', '到达机场三字码', '飞机型号', '飞机尺寸', '飞机型号三字码',
                 '经济舱原价', '经济舱最低价', '经济舱折扣', '商务舱原价', '商务舱最低价', '商务舱折扣',
                 '到达准点率', '停留次数']
        # Source column names, positionally matched with `order`.
        origin = ['数据获取日期', 'flightNo', 'marketAirlineName',
                  'departureday', 'departuretime', 'arrivalday', 'arrivaltime', 'duration',
                  'departureCountryName', 'departureCityName', 'departureAirportName', 'departureAirportCode',
                  'arrivalCountryName', 'arrivalCityName', 'arrivalAirportName', 'arrivalAirportCode',
                  'aircraftName', 'aircraftSize', 'aircraftCode',
                  'economy_origin', 'economy_low', 'economy_cut',
                  'bussiness_origin', 'bussiness_low', 'bussiness_cut',
                  'arrivalPunctuality', 'stopCount']
        df = df.rename(columns=dict(zip(origin, order)))
        df = df[order]

        os.makedirs(date, exist_ok=True)
        # os.path.join uses the platform's separator; the former hard-coded
        # backslash only produced a nested path on Windows.
        filename = os.path.join(os.getcwd(), date,
                                date + '-' + city[0] + '-' + city[1] + '.csv')
        df.to_csv(filename, encoding='GB18030', index=False)
        print('\n数据爬取完成', filename)
    except Exception as e:
        # Best effort: a failure for one route must not stop the crawl.
        print('合并数据失败', e)
def getcitycode():
    """Download Ctrip's city list and map display names to their codes.

    The ``data`` object of the response is walked group by group. The
    ``'热门'`` group is skipped. For every other group, each character of
    the group's key is used as a sub-key whose value is a list of entries
    with ``display`` and ``data`` fields.

    Returns:
        tuple: ``(cityname, citycode)`` where ``cityname`` is the list of
        display names in encounter order and ``citycode`` maps each display
        name to its ``data`` value. Both are empty when the response has no
        usable ``data``.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
        ValueError: If the response body is not valid JSON.
    """
    cityname, code = [], []
    # Ctrip's own API endpoint; the random query value varies per call.
    city_url = 'https://flights.ctrip.com/online/api/poi/get?v=' + str(random.random())
    headers = {
        'dnt': '1',
        'referer': 'https://verify.ctrip.com/',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
    }
    # Bounded wait so an unresponsive server cannot hang start-up.
    r = requests.get(city_url, headers=headers, timeout=30)
    # A missing or null "data" is treated as "no cities".
    citys = json.loads(r.text).get('data') or {}
    for city in citys:
        if city == '热门':
            continue
        for key in city:
            try:
                entries = citys[city][key]
            except (KeyError, TypeError, IndexError):
                # This character has no entry list in this group.
                continue
            if not isinstance(entries, list):
                continue
            for entry in entries:
                try:
                    display, data = entry['display'], entry['data']
                except (KeyError, TypeError):
                    # Skip malformed entries. Both fields are read before
                    # either list grows, so names and codes stay aligned.
                    continue
                cityname.append(display)
                code.append(data)
    citycode = dict(zip(cityname, code))
    return cityname, citycode
def process_date(city, citycode, departureDate, cabin="Y_S_C_F"):
    """Fetch, clean and save the flights of one route for one date.

    Args:
        city: Two-item sequence ``[departure_name, arrival_name]``; both
            names must be keys of ``citycode``.
        citycode: Mapping from city display name to a code string whose
            last three characters are passed on as the city code.
        departureDate: Departure date string.
        cabin: Cabin filter string forwarded to the search.

    Returns:
        None. Nothing is written when the query fails or no itinerary
        remains after filtering.
    """
    departure_code = citycode[city[0]][-3:]
    arrival_code = citycode[city[1]][-3:]

    ok, itineraries = get_flight_info(departure_code, arrival_code, departureDate, cabin)
    if not ok:
        return

    itineraries = check_data(itineraries)
    if not len(itineraries):
        return

    flights = proc_flightSegments(itineraries)
    prices = proc_priceList(itineraries)
    mergedata(flights, prices, departureDate, city)
三、主程序
if __name__ == '__main__':
    cityname, citycode = getcitycode()
    city = ['上海', '广州', '深圳', '北京']
    ytic = list(reversed(city))
    # Every ordered pair of distinct cities: departure in list order,
    # arrival in reversed list order.
    citys = [[m, n] for m in city for n in ytic if m != n]
    # Query tomorrow's flights.
    departureDate = (dt.now() + timedelta(days=1)).strftime('%Y-%m-%d')
    for route in citys:
        process_date(route, citycode, departureDate, cabin="Y_S_C_F")
        # NOTE: the original left a pause between routes disabled here
        # (time.sleep(10)); it remains disabled.
评论区