title
import requests
from bs4 import BeautifulSoup

url = 'https://www.samplestore.pk/search/?keywords=amazfit&dataBi=k1'

# Fetch the search results page. The timeout keeps the script from hanging
# indefinitely on an unresponsive server, and raise_for_status() stops us
# from parsing an HTTP error page as if it were the search results.
page = requests.get(url, timeout=10)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# Find all <h2> elements with a class of 'title'.
# The older, equivalent form passes an attribute dict:
#     soup.find_all('h2', {"class": "title"})
# As of Beautiful Soup 4.1.2, you can search by CSS class using the keyword
# argument class_:
titles = soup.find_all("h2", class_="title")

# Each result is the full <h2> tag, including the nested <a> link.
for title in titles:
    print(title)
1<h2 class="title">
2 <a href="amazfit-stratos-multisport-gps-smartwatch.html" title="">Amazfit Stratos Multisport GPS...</a>
3</h2>
4<h2 class="title"><a href="amazfit-gtr-smartwatch.html" title="">Amazfit GTR Smartwatch</a></h2>
5<h2 class="title"><a href="amazfit-gts-sports-smart-watch.html" title="">Amazfit GTS Sports Smart Watch</a></h2>
6<h2 class="title"><a href="amazfit-bip-lite-smartwatch.html" title="">Amazfit Bip Lite Smartwatch</a></h2>
7<h2 class="title"><a href="amazfit-gts-gtr-charging-cable.html" title="">Amazfit GTS/GTR Charging Cable</a></h2>
8<h2 class="title"><a href="amazfit-t-rex.html" title="">Amazfit T-Rex Military Smart W...</a></h2>
9<h2 class="title"><a href="amazfit-airrun-treadmill.html" title="">Amazfit AirRun Treadmill</a></h2>
10<h2 class="title"><a href="amazfit-neo.html" title="">Amazfit Neo Retro Design</a></h2>
11<h2 class="title"><a href="amazfit-gtr-2.html" title="">Amazfit GTR 2</a></h2>
12<h2 class="title"><a href="amazfit-gts-2.html" title="">Amazfit GTS 2 Smart Watch</a></h2>
13<h2 class="title"><a href="amazfit-bip-u.html" title="">Amazfit Bip U</a></h2>
select vs find_all
I prefer select over find_all because select and its CSS-like syntax are familiar to me as a web developer. find is a bit clunky when it comes to finding nested elements. Note that select returns a list, even if there is only one result.
1titles = soup.find_all("h2", class_="title")
1<h2 class="title">
2 <a href="amazfit-stratos-multisport-gps-smartwatch.html" title="">Amazfit Stratos Multisport GPS...</a>
3</h2>
4<h2 class="title"><a href="amazfit-gtr-smartwatch.html" title="">Amazfit GTR Smartwatch</a></h2>
5<h2 class="title"><a href="amazfit-gts-sports-smart-watch.html" title="">Amazfit GTS Sports Smart Watch</a></h2>
6<h2 class="title"><a href="amazfit-bip-lite-smartwatch.html" title="">Amazfit Bip Lite Smartwatch</a></h2>
7<h2 class="title"><a href="amazfit-gts-gtr-charging-cable.html" title="">Amazfit GTS/GTR Charging Cable</a></h2>
8<h2 class="title"><a href="amazfit-t-rex.html" title="">Amazfit T-Rex Military Smart W...</a></h2>
9<h2 class="title"><a href="amazfit-airrun-treadmill.html" title="">Amazfit AirRun Treadmill</a></h2>
10<h2 class="title"><a href="amazfit-neo.html" title="">Amazfit Neo Retro Design</a></h2>
11<h2 class="title"><a href="amazfit-gtr-2.html" title="">Amazfit GTR 2</a></h2>
12<h2 class="title"><a href="amazfit-gts-2.html" title="">Amazfit GTS 2 Smart Watch</a></h2>
13<h2 class="title"><a href="amazfit-bip-u.html" title="">Amazfit Bip U</a></h2>
1titles = soup.select('h2.title > a')
<a href="amazfit-stratos-multisport-gps-smartwatch.html" title="">Amazfit Stratos Multisport GPS...</a>
<a href="amazfit-gtr-smartwatch.html" title="">Amazfit GTR Smartwatch</a>
<a href="amazfit-gts-sports-smart-watch.html" title="">Amazfit GTS Sports Smart Watch</a>
<a href="amazfit-bip-lite-smartwatch.html" title="">Amazfit Bip Lite Smartwatch</a>
<a href="amazfit-gts-gtr-charging-cable.html" title="">Amazfit GTS/GTR Charging Cable</a>
<a href="amazfit-t-rex.html" title="">Amazfit T-Rex Military Smart W...</a>
<a href="amazfit-airrun-treadmill.html" title="">Amazfit AirRun Treadmill</a>
<a href="amazfit-neo.html" title="">Amazfit Neo Retro Design</a>
<a href="amazfit-gtr-2.html" title="">Amazfit GTR 2</a>
<a href="amazfit-gts-2.html" title="">Amazfit GTS 2 Smart Watch</a>
<a href="amazfit-bip-u.html" title="">Amazfit Bip U</a>
extract data from a page/site
create a CSV file with that data
Crawl through all Product pages on the site and populate a spreadsheet
Schedule the crawling to run on a frequency (e.g. once a day)
How to find nested tags?
# Write these to a CSV file
filename = 'products.csv'

# products.csv will be created in the current working directory.
# newline='' leaves line-ending handling to the csv module, and the explicit
# encoding keeps non-ASCII product text intact instead of depending on the
# platform's default encoding.
with open(filename, 'w', newline='', encoding='utf-8') as file:
    # fieldnames = a list containing the column headers, in the order in
    # which the columns should be written to the CSV file
    fieldnames = ['Title', 'Price']
    writer = csv.DictWriter(file, fieldnames=fieldnames)

    # first row that contains the column headings
    writer.writeheader()

    # zip() pairs each title with its price and stops at the shorter list.
    for title, price in zip(titles, prices):
        writer.writerow({'Title': title.text, 'Price': price.text})
Here are the contents of products.csv
Title,Price
Amazfit Stratos Multisport GPS...,"Rs 28,000"
Amazfit GTR Smartwatch,"Rs 22,500"
Amazfit GTS Sports Smart Watch,"Rs 23,499"
Amazfit Bip Lite Smartwatch,"Rs 9,000"
Amazfit GTS/GTR Charging Cable,Rs 800
Amazfit T-Rex Military Smart W...,"Rs 23,500"
Amazfit AirRun Treadmill,"Rs 149,000"
Amazfit Neo Retro Design,"Rs 6,499"
Amazfit GTR 2,"Rs 31,499"
Amazfit GTS 2 Smart Watch,"Rs 30,599"
Amazfit Bip U,"Rs 11,000"
You can do this with .get('src')
1image.get('src').strip()
This can be done with the string method .strip()
# Walk the four result lists in lockstep and print one cleaned-up row per product.
product_tags = zip(titles, discount_prices, original_prices, images)
for title_tag, discount_tag, original_tag, image_tag in product_tags:
    # .text gives the tag's text content; .strip() trims surrounding whitespace.
    title = title_tag.text.strip()
    discount_price = discount_tag.text.strip()
    original_price = original_tag.text.strip()
    # The image URL lives in the tag's src attribute rather than in its text.
    image = image_tag.get('src').strip()

    print(title, discount_price, original_price, image)