侧边栏壁纸
  • 累计撰写 64 篇文章
  • 累计创建 46 个标签
  • 累计收到 94 条评论

目 录CONTENT

文章目录

基于request的携程机票爬取程序

草莓牛奶
2022-06-27 / 3 评论 / 0 点赞 / 937 阅读 / 2,108 字 / 正在检测是否收录...
温馨提示:
「博客文章过时后会及时更新;如无特殊说明,文章内容仍然有效。欢迎指正内容中的错误」

基于selenium的携程机票爬取程序(绕过反爬)
基于selenium的携程机票爬取程序【V2.0】
注意:由于携程的更新,以下内容可能已经失效。如有需要,可以尝试使用上面两篇文章中的 selenium 方案,或者自行按照参考文章的流程进行 JS 逆向。

转载一篇基于request和js逆向的携程机票爬取程序——获取携程机票信息(爬虫)

JS 逆向的方案很完美,但是短时间内发送大量请求会导致 IP 被封禁(基本上是秒封)。

「基于selenium的携程机票爬取程序(绕过反爬)」的执行效率虽然比不上 requests,但是能够保证 IP 不被封禁。

import os
import hashlib
import json
import logging
import random
import time
import pandas as pd
import requests
from fake_useragent import UserAgent
from datetime import datetime as dt,timedelta

一、获取原始数据

# References:
#   - List of airports - Wikipedia
#     https://zh.wikipedia.org/wiki/%E4%B8%AD%E5%8D%8E%E4%BA%BA%E6%B0%91%E5%85%B1%E5%92%8C%E5%9B%BD%E6%9C%BA%E5%9C%BA%E5%88%97%E8%A1%A8
#   - Cracking the Ctrip international-flight "sign" https://blog.csdn.net/weixin_38927522/article/details/108214323
#   - Front-end anti-anti-crawling: read this and you can graduate https://zhuanlan.zhihu.com/p/250176143


# Shared fake_useragent generator; get_flight_info reads `ua.chrome` from it
# to fill the "user-agent" request header.
ua = UserAgent()


def get_cookie_bfa():
    """Build a synthetic ``_bfa`` cookie string.

    The value consists of eight dot-separated fields: the literal ``1``,
    the current time in milliseconds, a random 6-character token drawn from
    lowercase letters and digits, ``1``, the same timestamp twice more, and
    finally ``1`` and ``1``.

    Example result:
        ``_bfa=1.1639722810158.u3jal2.1.1639722810158.1639722810158.1.1``

    Returns:
        The cookie as a ``"_bfa=<value>"`` string.
    """
    alphabet = "abcdefghijklmnopqrstuvwxyz1234567890"
    token = "".join(random.choice(alphabet) for _ in range(6))
    now_ms = str(int(round(time.time() * 1000)))

    fields = ("1", now_ms, token, "1", now_ms, now_ms, "1", "1")
    return "_bfa=" + ".".join(fields)


def get_sign(transaction_id, departure_city_code, arrival_city_code, departure_date):
    """Compute the ``sign`` header value used when calling the Ctrip search API.

    The four arguments are concatenated in the order given, encoded as
    UTF-8 and hashed with MD5.

    Returns:
        The hexadecimal MD5 digest as a string.
    """
    raw = (
        transaction_id
        + departure_city_code
        + arrival_city_code
        + departure_date
    )
    return hashlib.md5(raw.encode('utf-8')).hexdigest()


def get_transaction_id(departure_city_code, arrival_city_code, departure_date, cabin, timeout=10):
    """Fetch the flight-list payload and its ``transactionID`` for a one-way search.

    Args:
        departure_city_code: Code of the departure city, placed in the URL.
        arrival_city_code: Code of the arrival city, placed in the URL.
        departure_date: Departure date string, sent as ``depdate``.
        cabin: Cabin filter string, sent as ``cabin``.
        timeout: Seconds to wait for the HTTP response before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Returns:
        ``(transaction_id, flight_list_data)`` on success, where
        ``flight_list_data`` is the ``"data"`` object of the JSON response.
        ``("", None)`` on any failure (network error, non-200 status,
        malformed body); the failure is logged.
    """
    flight_list_url = (
        "https://flights.ctrip.com/international/search/api/flightlist"
        "/oneway-{}-{}?_=1&depdate={}&cabin={}&containstax=1"
    ).format(departure_city_code, arrival_city_code, departure_date, cabin)

    # Network failures are reported the same way as every other failure
    # instead of escaping to the caller as an exception.
    try:
        flight_list_req = requests.get(url=flight_list_url, timeout=timeout)
    except requests.RequestException as e:
        logging.error("get transaction id failed, {}".format(e))
        return "", None

    if flight_list_req.status_code != 200:
        logging.error("get transaction id failed, status code {}".format(flight_list_req.status_code))
        return "", None

    try:
        flight_list_data = flight_list_req.json()["data"]
        transaction_id = flight_list_data["transactionID"]
    except (ValueError, KeyError, TypeError) as e:
        # ValueError: body is not JSON; KeyError/TypeError: unexpected shape.
        logging.error("get transaction id failed, {}".format(e))
        return "", None

    return transaction_id, flight_list_data


def get_flight_info(departure_city_code, arrival_city_code, departure_date, cabin, timeout=10):
    """Query the batch-search endpoint and return the itineraries for a route.

    First obtains the transaction id and flight-list payload, then posts
    that payload back with the ``sign`` header and a generated ``_bfa``
    cookie.

    Args:
        departure_city_code: Code of the departure city.
        arrival_city_code: Code of the arrival city.
        departure_date: Departure date string.
        cabin: Cabin filter string.
        timeout: Seconds to wait for the search response before giving up.

    Returns:
        ``(True, itineraries)`` on success, where ``itineraries`` is the
        ``flightItineraryList`` of the response (an empty list when the key
        is absent or null). ``(False, None)`` on any failure; the failure
        is logged.
    """
    # Fetch the transactionID and the route payload.
    transaction_id, flight_list_data = get_transaction_id(departure_city_code, arrival_city_code, departure_date, cabin)
    if transaction_id == "" or flight_list_data is None:
        return False, None

    try:
        scope = flight_list_data["scope"]
    except (KeyError, TypeError) as e:
        logging.error("get flight info failed, {}".format(e))
        return False, None

    # "sign" header required by the search endpoint.
    sign = get_sign(transaction_id, departure_city_code, arrival_city_code, departure_date)

    # "_bfa" value sent in the cookie header.
    bfa = get_cookie_bfa()

    # Build the request and run the search.
    search_url = "https://flights.ctrip.com/international/search/api/search/batchSearch"
    search_headers = {
        "transactionid": transaction_id,
        "sign": sign,
        "scope": scope,
        "origin": "https://flights.ctrip.com",
        "referer": "https://flights.ctrip.com/online/list/oneway-{}-{}"
                   "?_=1&depdate={}&cabin={}&containstax=1".format(departure_city_code, arrival_city_code,
                                                                   departure_date, cabin),
        "content-type": "application/json;charset=UTF-8",
        "user-agent": ua.chrome,
        "cookie": bfa,
    }
    try:
        r = requests.post(url=search_url, headers=search_headers,
                          data=json.dumps(flight_list_data), timeout=timeout)
    except requests.RequestException as e:
        logging.error("get flight info failed, {}".format(e))
        return False, None

    if r.status_code != 200:
        logging.error("get flight info failed, status code {}".format(r.status_code))
        return False, None

    try:
        result_json = r.json()
        if result_json["data"]["context"]["flag"] != 0:
            logging.error("get flight info failed, {}".format(result_json))
            return False, None
        # A missing or null list both mean "no itineraries".
        result_data = result_json["data"].get("flightItineraryList") or []
    except (ValueError, KeyError, TypeError, AttributeError) as e:
        logging.error("get flight info failed, {}".format(e))
        return False, None

    return True, result_data

二、数据处理

def check_data(flightItineraryList):
    """Remove, in place, every itinerary whose first segment has transfers.

    An itinerary is kept only when
    ``item['flightSegments'][0]['transferCount']`` equals 0.

    Returns:
        The same list object that was passed in, after filtering.
    """
    # Walk from the tail so deletions never shift an index still to be visited.
    for idx in reversed(range(len(flightItineraryList))):
        first_segment = flightItineraryList[idx]['flightSegments'][0]
        if first_segment['transferCount'] != 0:
            del flightItineraryList[idx]

    return flightItineraryList

    
def proc_flightSegments(flightItineraryList):
    """Flatten the first flight of every itinerary into one DataFrame row.

    For each itinerary, the first entry of
    ``['flightSegments'][0]['flightList']`` is copied, a fixed set of fields
    is dropped, and ``departureDateTime`` / ``arrivalDateTime`` are replaced
    by separate day and time columns (split on the first space).

    Rows are collected in a list and turned into a DataFrame once, which
    avoids re-copying the frame on every iteration and does not fail when a
    remaining field holds a list.

    Args:
        flightItineraryList: List of itinerary dicts; not modified.

    Returns:
        A DataFrame with one row per itinerary (empty for empty input).

    Raises:
        KeyError / IndexError: if an itinerary lacks the expected fields or
            a date-time string does not contain a space.
    """
    # Fields that are not wanted in the output table.
    drop_keys = (
        'sequenceNo', 'marketAirlineCode',
        'departureProvinceId', 'departureCityId', 'departureCityCode',
        'departureAirportShortName', 'departureTerminal',
        'arrivalProvinceId', 'arrivalCityId', 'arrivalCityCode',
        'arrivalAirportShortName', 'arrivalTerminal',
        'transferDuration', 'stopList', 'leakedVisaTagSwitch', 'trafficType',
        'highLightPlaneNo', 'mealType',
        'operateAirlineCode', 'arrivalDateTime', 'departureDateTime',
        'operateFlightNo', 'operateAirlineName',
    )

    rows = []
    for itinerary in flightItineraryList:
        # Copy so the caller's data is left untouched.
        flight = dict(itinerary['flightSegments'][0]['flightList'][0])

        departure = flight['departureDateTime'].split(' ')
        arrival = flight['arrivalDateTime'].split(' ')
        departureday, departuretime = departure[0], departure[1]
        arrivalday, arrivaltime = arrival[0], arrival[1]

        # Absent keys are simply skipped.
        for key in drop_keys:
            flight.pop(key, None)

        # Store the date and time as separate columns.
        flight.update({'departureday': departureday, 'departuretime': departuretime,
                       'arrivalday': arrivalday, 'arrivaltime': arrivaltime})
        rows.append(flight)

    return pd.DataFrame(rows)
                    
            
def _summarize_cabin(fares, discounts):
    """Return ``(origin, low, cut)`` for one cabin class; ``''`` marks "no value".

    ``origin`` is the fare whose discount rate equals 1; when none exists it
    is estimated as ``int(max(fares) / max(discounts))``. ``low`` and ``cut``
    are the minimum fare and minimum discount rate, reported only when some
    fare is actually discounted (minimum discount rate != 1).
    """
    if not fares:
        return '', '', ''

    try:
        origin = fares[discounts.index(1)]
    except ValueError:
        # No full-price fare listed: estimate it from the highest fare.
        try:
            origin = int(max(fares) / max(discounts))
        except ZeroDivisionError:
            # Every discount rate is 0, so no estimate is possible.
            origin = ''

    if min(discounts) != 1:
        return origin, min(fares), min(discounts)
    return origin, '', ''


def proc_priceList(flightItineraryList):
    """Summarise economy and business fares of every itinerary.

    For each itinerary the flight number is the part of ``itineraryId``
    before the first underscore. Entries of ``priceList`` with cabin ``'Y'``
    count as economy and ``'C'`` as business; other cabins are ignored.

    Args:
        flightItineraryList: List of itinerary dicts; not modified.

    Returns:
        A DataFrame with one row per itinerary and the columns ``flightNo``,
        ``economy_origin``, ``economy_low``, ``economy_cut``,
        ``bussiness_origin``, ``bussiness_low``, ``bussiness_cut``
        (an empty frame for empty input). Missing values are ``''``.

    Raises:
        KeyError / IndexError: if an itinerary or price entry lacks the
            expected fields.
    """
    rows = []

    for itinerary in flightItineraryList:
        flightNo = itinerary['itineraryId'].split('_')[0]

        # cabin code -> (fares, discount rates), kept index-aligned.
        by_cabin = {'Y': ([], []), 'C': ([], [])}

        for price in itinerary['priceList']:
            adultPrice = price['adultPrice']
            cabin = price['cabin']
            discountRate = price['priceUnitList'][0]['flightSeatList'][0]['discountRate']
            if cabin in by_cabin:
                fares, discounts = by_cabin[cabin]
                fares.append(adultPrice)
                discounts.append(discountRate)

        economy_origin, economy_low, economy_cut = _summarize_cabin(*by_cabin['Y'])
        bussiness_origin, bussiness_low, bussiness_cut = _summarize_cabin(*by_cabin['C'])

        rows.append({
            'flightNo': flightNo,
            'economy_origin': economy_origin,
            'economy_low': economy_low,
            'economy_cut': economy_cut,
            'bussiness_origin': bussiness_origin,
            'bussiness_low': bussiness_low,
            'bussiness_cut': bussiness_cut,
        })

    # Build the frame once instead of concatenating row by row.
    return pd.DataFrame(rows)
        
   
def mergedata(flights,prices,date,city):
    """Join flight and price tables and write them to a CSV file.

    The two frames are merged on ``flightNo``, stamped with today's date,
    renamed to the Chinese column headers and written, GB18030-encoded, to
    ``<cwd>/<date>/<date>-<city[0]>-<city[1]>.csv``. The directory is
    created when missing. The path is built with ``os.path.join`` so the
    file lands inside that directory on every platform.

    Any failure is caught and reported with ``print``; nothing is raised.

    Args:
        flights: DataFrame of flight details containing ``flightNo``.
        prices: DataFrame of price summaries containing ``flightNo``.
        date: Departure date string; used as directory name and file prefix.
        city: Two-item sequence of departure and arrival city names.
    """
    # Source column -> output header, listed in output column order.
    column_names = {
        '数据获取日期': '数据获取日期',
        'flightNo': '航班号',
        'marketAirlineName': '航空公司',
        'departureday': '出发日期',
        'departuretime': '出发时间',
        'arrivalday': '到达日期',
        'arrivaltime': '到达时间',
        'duration': '飞行时长',
        'departureCountryName': '出发国家',
        'departureCityName': '出发城市',
        'departureAirportName': '出发机场',
        'departureAirportCode': '出发机场三字码',
        'arrivalCountryName': '到达国家',
        'arrivalCityName': '到达城市',
        'arrivalAirportName': '到达机场',
        'arrivalAirportCode': '到达机场三字码',
        'aircraftName': '飞机型号',
        'aircraftSize': '飞机尺寸',
        'aircraftCode': '飞机型号三字码',
        'economy_origin': '经济舱原价',
        'economy_low': '经济舱最低价',
        'economy_cut': '经济舱折扣',
        'bussiness_origin': '商务舱原价',
        'bussiness_low': '商务舱最低价',
        'bussiness_cut': '商务舱折扣',
        'arrivalPunctuality': '到达准点率',
        'stopCount': '停留次数',
    }

    try:
        df = flights.merge(prices, on=['flightNo'])
        df['数据获取日期'] = dt.now().strftime('%Y-%m-%d')

        # Rename, then select and order the output columns.
        df = df.rename(columns=column_names)
        df = df[list(column_names.values())]

        os.makedirs(date, exist_ok=True)
        filename = os.path.join(
            os.getcwd(), date, date + '-' + city[0] + '-' + city[1] + '.csv'
        )

        df.to_csv(filename, encoding='GB18030', index=False)

        print('\n数据爬取完成', filename)
    except Exception as e:
        # Best effort: one failed route must not stop the whole run.
        print('合并数据失败', e)

        
def getcitycode(timeout=10):
    """Download the Ctrip city list and map display names to city data strings.

    The ``data`` object of the response is walked group by group; the
    ``'热门'`` group is skipped. Inside each remaining group the lists are
    looked up by the individual characters of the group's name
    (presumably names such as ``'ABCDEF'`` with one list per letter —
    verify against the live response). Each entry contributes its
    ``display`` and ``data`` values.

    A name is recorded only when both fields are present, so the names and
    codes can never drift out of alignment.

    Args:
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        ``(cityname, citycode)``: the list of display names in response
        order, and a dict mapping display name to its ``data`` value (the
        last occurrence wins for duplicates). ``([], {})`` when the
        response carries no usable ``data`` object.

    Raises:
        requests.RequestException: on network failure or timeout.
        ValueError: if the response body is not valid JSON.
    """
    cityname, code = [], []
    # City list served by Ctrip's own API; `v` is a random cache-buster.
    city_url = 'https://flights.ctrip.com/online/api/poi/get?v=' + str(random.random())
    headers = {
        'dnt': '1',
        'referer': 'https://verify.ctrip.com/',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
    }
    r = requests.get(city_url, headers=headers, timeout=timeout)
    citys = json.loads(r.text).get('data')
    if not isinstance(citys, dict):
        logging.error("get city code failed, unexpected response data: {!r}".format(citys))
        return [], {}

    for group_name, group in citys.items():
        if group_name == '热门':
            continue
        if not isinstance(group, dict):
            continue
        for letter in group_name:
            entries = group.get(letter)
            if not isinstance(entries, (list, tuple)):
                continue
            for entry in entries:
                try:
                    display, data = entry['display'], entry['data']
                except (KeyError, TypeError):
                    # Skip malformed entries without misaligning the lists.
                    continue
                cityname.append(display)
                code.append(data)

    citycode = dict(zip(cityname, code))

    return cityname, citycode


def process_date(city, citycode,departureDate, cabin="Y_S_C_F"):
    """Fetch, filter, process and save the flights of one route.

    Args:
        city: Two-item sequence of departure and arrival city names; both
            must be keys of ``citycode``.
        citycode: Mapping from city name to a string whose last three
            characters are used as the city code.
        departureDate: Departure date string passed to the search.
        cabin: Cabin filter string passed to the search.

    Returns:
        None. Nothing is written when the search fails or no itinerary
        remains after ``check_data``.
    """
    origin_name, destination_name = city[0], city[1]
    departureCityCode = citycode[origin_name][-3:]
    arrivalCityCode = citycode[destination_name][-3:]

    ok, result = get_flight_info(departureCityCode, arrivalCityCode, departureDate, cabin)
    if not ok:
        return

    result = check_data(result)
    if len(result) == 0:
        return

    flights = proc_flightSegments(result)
    prices = proc_priceList(result)
    mergedata(flights, prices, departureDate, city)

三、主程序

if __name__ == '__main__':
    # Script entry point: crawl tomorrow's flights for every ordered pair
    # of distinct cities below and write one CSV per route.
    cityname, citycode = getcitycode()

    city = ['上海', '广州', '深圳', '北京']
    # Destinations are taken in reverse list order for each origin.
    citys = [
        [origin, destination]
        for origin in city
        for destination in reversed(city)
        if origin != destination
    ]

    departureDate = (dt.now() + timedelta(days=1)).strftime('%Y-%m-%d')

    for route in citys:
        process_date(route, citycode, departureDate, cabin="Y_S_C_F")
        # NOTE: the author left a 10-second pause between routes disabled here.
0

评论区