#Coded by Andrew C
from subprocess import call, sys
def install(package):
    """Install *package* with pip for the interpreter running this script.

    Runs ``<python> -m pip --disable-pip-version-check -q install <package>``
    using ``sys.executable`` so the package is installed for the same
    interpreter that will import it.

    Args:
        package: Requirement name/specifier handed to ``pip install``.

    Returns:
        The exit status of the pip process (0 means success).
    """
    command = [
        sys.executable,
        "-m",
        "pip",
        "--disable-pip-version-check",
        "-q",
        "install",
        package,
    ]
    exit_code = call(command)
    if exit_code != 0:
        # Report the failure here; otherwise it only surfaces later as an
        # ImportError far away from its cause.
        print(
            "pip install {} failed with exit code {}".format(package, exit_code),
            file=sys.stderr,
        )
    return exit_code
# Third-party packages imported below. pandas is imported by this script, so
# it is installed here as well. datetime is part of the standard library and
# needs no install (the PyPI name resolves to Zope's unrelated DateTime
# package).
for _package in ('selenium', 'beautifulsoup4', 'pandas', 'lxml', 'openpyxl'):
    install(_package)
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from bs4 import BeautifulSoup
import re
import time
import pandas as pd
import re
import datetime
from openpyxl import load_workbook
import warnings
# Silence every DeprecationWarning for the rest of the run.
# NOTE(review): presumably aimed at Selenium's deprecation notices - confirm.
warnings.filterwarnings("ignore", category=DeprecationWarning)
def _clean_times(fragments, noon_replacement=r'\2\3\4'):
    """Normalise fixed-width time fragments cut out of the scraped text.

    Three anchored substitutions are applied to each fragment in order:
    an ``a``-suffixed time becomes ``H:MM am``, a ``p``-suffixed time becomes
    ``H:MM pm``, and a fragment starting with ``12:MM`` is rewritten with
    *noon_replacement*. Each substitution also drops an optional leading
    en dash. Fragments matching none of the patterns are returned unchanged.

    Args:
        fragments: Iterable of strings to clean.
        noon_replacement: Replacement template for the ``12:MM`` pattern.

    Returns:
        A list with one cleaned string per input fragment.
    """
    cleaned = []
    for fragment in fragments:
        fragment = re.sub(r'^(–?)(\w\w?)(:)(\w\w)(\W)(a\w?)', r'\2\3\4 am', fragment)
        fragment = re.sub(r'^(–?)(\w\w?)(:)(\w\w)(\W?)(p)(\w?)', r'\2\3\4 pm', fragment)
        fragment = re.sub(r'^(–?)(12)(:)(\w\w)', noon_replacement, fragment)
        cleaned.append(fragment)
    return cleaned


def _load_results_page(url, requests):
    """Open *url* in Chrome and return the page parsed with BeautifulSoup.

    Args:
        url: Address of the search-results page.
        requests: Integer used (modulo the number of agents) to pick the
            user-agent string sent by the browser.

    Returns:
        The parsed page, or None when the first ``<p>`` on the page is the
        site's "Please confirm that you are a real user." message.
    """
    # Local import so this block carries its own dependency. Service is how
    # Selenium 4 is told where the chromedriver binary lives; the positional
    # driver path and desired_capabilities arguments were removed in 4.10.
    from selenium.webdriver.chrome.service import Service

    agents = ["Firefox/66.0.3", "Chrome/73.0.3683.68", "Edge/16.16299"]
    user_agent = agents[requests % len(agents)]
    print("User agent: " + user_agent)

    chrome_options = webdriver.ChromeOptions()
    # No surrounding quotes: the argument is passed as a list item, not via a shell.
    chrome_options.add_argument('--user-agent=' + user_agent)
    chrome_options.add_experimental_option('useAutomationExtension', False)
    driver = webdriver.Chrome(
        service=Service(executable_path="chromedriver.exe"),
        options=chrome_options,
    )
    try:
        driver.implicitly_wait(20)
        driver.get(url)
        time.sleep(5)
        soup = BeautifulSoup(driver.page_source, 'lxml')
        # find() returns None on a page without <p>, where find_all(...)[0]
        # would raise IndexError.
        first_paragraph = soup.find('p')
        if (first_paragraph is not None
                and first_paragraph.getText() == "Please confirm that you are a real user."):
            return None
        # Presumably gives the results time to finish rendering before the
        # page source is captured again.
        time.sleep(20)
        return BeautifulSoup(driver.page_source, 'lxml')
    finally:
        # quit() ends the browser and the chromedriver process, even when an
        # exception is raised above.
        driver.quit()


def airline(origin="MIA", destination="SAV", startdate="2023-07-04",
            enddate="2023-07-11", requests=2, output_path=r'E:\test.csv') -> str:
    """Scrape round-trip flight results and write them to a CSV file.

    Builds the search URL from the route and dates, loads it in Chrome,
    extracts times, airlines and prices from the page, and writes one row per
    result to *output_path*.

    Args:
        origin: Origin airport code placed in the URL.
        destination: Destination airport code placed in the URL.
        startdate: Fly-out date string placed in the URL.
        enddate: Fly-back date string placed in the URL.
        requests: Integer used to rotate the browser's user-agent string.
        output_path: Path of the CSV file to write.

    Returns:
        "success" after the CSV has been written, or "failure" when the site
        served its real-user confirmation page instead of results.

    Raises:
        ValueError: Raised by pandas when the scraped columns do not all have
            the same length.
    """
    # NOTE(review): the "=stops=0" suffix is kept verbatim; it looks like a
    # query-string filter that lost its "?..." prefix - confirm against the site.
    url = ("https://www.TravelSite.com/flights/" + origin + "-" + destination
           + "/" + startdate + "/" + enddate + "=stops=0")

    soup = _load_results_page(url, requests)
    if soup is None:
        print("No Access.Try Again.")
        time.sleep(20)
        return "failure"

    # Web scraping: each matched div holds the text of one flight leg.
    leg_texts = [div.getText() for div in soup.find_all('div', attrs={'class': 'VY2U'})]
    price_divs = soup.find_all('div', attrs={'class': re.compile('f8F1-price-text')})
    prices = [div.getText() for div in price_divs]

    # Legs alternate: even positions are the fly-out leg, odd the fly-back leg.
    flyout_legs = leg_texts[::2]
    flyback_legs = leg_texts[1::2]

    # NOTE(review): fixed-width slicing assumes a one-digit departure hour;
    # with a two-digit hour every later field is shifted by one character.
    flyout_departures = _clean_times(s[0:7] for s in flyout_legs)
    flyout_arrivals = _clean_times(s[8:15] for s in flyout_legs)
    flyback_departures = _clean_times(s[0:7] for s in flyback_legs)
    # NOTE(review): only this column appends " pm" to 12:MM times (kept from
    # the original); it can yield "12:05 pm pm" - confirm the intended rule.
    flyback_arrivals = _clean_times((s[8:15] for s in flyback_legs),
                                    noon_replacement=r'\2\3\4 pm')

    # Drop a leading run of lowercase letters (e.g. a leftover "am"/"pm") in
    # front of the airline name.
    flyout_airlines = [re.sub(r'^([a-z]+)(\w*)', r'\2', s[15:]) for s in flyout_legs]
    flyback_airlines = [re.sub(r'^([a-z]+)(\w*)', r'\2', s[15:]) for s in flyback_legs]

    # Only every second scraped price element is kept.
    result_prices = prices[::2]
    timestamp = datetime.datetime.now()

    # Pull it all together in a dataframe; scalars are repeated on every row.
    df = pd.DataFrame({
        "Timestamp": timestamp,
        "Origin": origin,
        "Destination": destination,
        "Flyout_Date": startdate,
        "Flyback_Date": enddate,
        "Price": result_prices,
        "Flyout_D": flyout_departures,
        "Flyout_A": flyout_arrivals,
        "Flyback_D": flyback_departures,
        "Flyback_A": flyback_arrivals,
        "Flyout_Airline": flyout_airlines,
        "Flyback_Airline": flyback_airlines,
    })
    df.to_csv(output_path, index=False, header=True)
    # Pause kept from the original; presumably spaces out repeated runs - confirm.
    time.sleep(15)
    return "success"
# Run the scrape only when this file is executed as a script, so importing
# the module does not launch a browser.
if __name__ == "__main__":
    airline()
# Watch the video "Python Bytes - Web Scrapping Flight Prices to Excel
# BeautifulSoup Code in Descrptn" online without registration, in good quality
# (duration not given). The video was added by user AC on 26 June 2023; don't
# forget to share the link with friends and acquaintances. It has been viewed
# 821 times on our site and 15 people liked it.