Now you only need to combine all the interesting elements from the website to create a feature that you could use in a model to predict economic activity in geographic regions or foot traffic in specific neighborhoods.
With Selenium, you can follow the links to the next pages and quickly build a dataset of over 10,000 restaurants in NYC that you could then update periodically to track a time series. First, we set up a function that parses the content of the pages that we plan on crawling:
def parse_html(html):
    """Parse one OpenTable listings page into a DataFrame of restaurants.

    Args:
        html: Raw page source (str), as returned by ``driver.page_source``.

    Returns:
        pd.DataFrame with one row per restaurant and columns
        name, bookings, rating, reviews, price, cuisine, location.
        Empty DataFrame if the page contains no 'rest-row-info' divs.
    """
    data, item = pd.DataFrame(), {}
    soup = BeautifulSoup(html, 'lxml')
    for i, resto in enumerate(soup.find_all('div', class_='rest-row-info')):
        item['name'] = resto.find('span', class_='rest-row-name-text').text
        # Booking count is embedded in free text, e.g. "Booked 14 times today";
        # r'\d+' extracts the number. 'NA' when the element is absent.
        booking = resto.find('div', class_='booking')
        item['bookings'] = re.search(r'\d+', booking.text).group() if booking else 'NA'
        # Star rating is encoded in the inline style attribute (a percentage width).
        rating = resto.select('div.all-stars.filled')
        item['rating'] = int(re.search(r'\d+', rating[0].get('style')).group()) if rating else 'NA'
        reviews = resto.find('span', class_='star-rating-text--review-text')
        item['reviews'] = int(re.search(r'\d+', reviews.text).group()) if reviews else 'NA'
        # Price tier shown as '$', '$$', ... — count of dollar signs gives the level.
        item['price'] = int(resto.find('div', class_='rest-row-pricing').find('i').text.count('$'))
        item['cuisine'] = resto.find('span', class_='rest-row-meta--cuisine').text
        item['location'] = resto.find('span', class_='rest-row-meta--location').text
        # Store each restaurant as a column, then transpose so rows = restaurants.
        data[i] = pd.Series(item)
    return data.T
Then, we start a headless browser that continues to click on the Next button for us and capture the results displayed on each page:
# Crawl all NYC listing pages: parse each page, accumulate results, and
# advance via the 'Next' link until it disappears or a page yields no rows.
restaurants = pd.DataFrame()
driver = webdriver.Firefox()
url = "https://www.opentable.com/new-york-restaurant-listings"
driver.get(url)
try:
    while True:
        sleep(1)  # give the page time to render before scraping
        new_data = parse_html(driver.page_source)
        if new_data.empty:
            break
        restaurants = pd.concat([restaurants, new_data], ignore_index=True)
        print(len(restaurants))  # progress indicator: total rows collected so far
        # find_elements (plural) returns [] on the last page instead of raising
        # NoSuchElementException, so the loop ends cleanly.
        next_links = driver.find_elements_by_link_text('Next')
        if not next_links:
            break
        next_links[0].click()
finally:
    # Always shut the browser down, even if parsing or navigation fails,
    # so no orphaned Firefox process is left behind.
    driver.close()
Websites continue to change, so this code may stop working at some point and will require updating to follow the latest site navigation and bot detection.