from selenium import webdriver from selenium.webdriver.chrome.options import Options from selenium.webdriver.common.by import By from selenium.webdriver.common.keys import Keys import pandas as pd import time import requests from bs4 import BeautifulSoup as BS from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.action_chains import ActionChains from selenium.webdriver.chrome.service import Service from webdriver_manager.chrome import ChromeDriverManager from multiprocessing import Pool from concurrent.futures import ThreadPoolExecutor user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36' option = Options() option.add_argument('--no-sandbox') option.add_argument("--disable-infobars") option.add_argument(f'user-agent={user_agent}') option.add_argument('--headless') data2 = pd.read_excel('ST.xlsx') def get_page_url(pn): driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=option) driver.implicitly_wait(2) driver.get(f'https://www.digikey.com/en/products/result?keywords={pn}') time.sleep(1) if '/detail' in driver.current_url: url = driver.current_url driver.quit() return url soup = BS(driver.page_source, 'lxml') driver.quit() def get_page_data(row): pn = row['Название'] driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=option) driver.implicitly_wait(2) driver.get(f'https://www.digikey.com/en/products/result?keywords={pn}') time.sleep(1) if '/detail' in driver.current_url: soup = BS(driver.page_source, 'lxml') driver.quit() tab = soup.find_all('table', class_='jss62 jss58') try: table = pd.read_html(str(tab)) table = pd.DataFrame(table[-2]) msl = table['Description'].values[table.loc[table['Attribute'] == 'Moisture Sensitivity Level (MSL)'].index.values[0]] print(pn, msl) return msl except: print(pn, 'No MSL') try: table = pd.read_html(str(tab)) table = pd.DataFrame(table[-1]) msl = table['Description'].values[ table.loc[table['Attribute'] == 'Moisture Sensitivity Level (MSL)'].index.values[0]] print(pn, msl) return msl except: print(pn, 'No MSL') try: table = pd.read_html(str(tab)) table = pd.DataFrame(table[0]) msl = table['Description'].values[ table.loc[table['Attribute'] == 'Moisture Sensitivity Level (MSL)'].index.values[0]] print(pn, msl) return msl except: print(pn, 'No MSL') try: table = pd.read_html(str(tab)) table = pd.DataFrame(table[-3]) msl = table['Description'].values[ table.loc[table['Attribute'] == 'Moisture Sensitivity Level (MSL)'].index.values[0]] print(pn, msl) return msl except: print(pn, 'No MSL') else: for i in driver.find_elements(By.XPATH, '//*[@id="__next"]/main/div/div/div[1]/div[2]/div[2]/section/div/div/a'): if 'stmicroelectronics' in i.get_attribute('href'): url = i.get_attribute('href') driver.quit() driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=option) driver.get(url) time.sleep(2) soup = BS(driver.page_source, 'lxml') driver.quit() tab = soup.find_all('table', class_='jss62 jss58') # print(tab) try: table = pd.read_html(str(tab)) table = pd.DataFrame(table[-3]) msl = table['Description'].values[table.loc[table['Attribute'] == 'Moisture Sensitivity Level (MSL)'].index.values[0]] print(pn, msl) return msl except: print(pn, 'no_msl') try: table = pd.read_html(str(tab)) table = pd.DataFrame(table[-2]) msl = table['Description'].values[ table.loc[table['Attribute'] == 'Moisture Sensitivity Level (MSL)'].index.values[0]] print(pn, msl) return msl except: print(pn, 'no_msl') try: table = pd.read_html(str(tab)) table = pd.DataFrame(table[-1]) msl = table['Description'].values[ table.loc[table['Attribute'] == 'Moisture Sensitivity Level (MSL)'].index.values[0]] print(pn, msl) return msl except: print(pn, 'no_msl') try: table = pd.read_html(str(tab)) table = pd.DataFrame(table[-4]) msl = table['Description'].values[ table.loc[table['Attribute'] == 'Moisture Sensitivity Level (MSL)'].index.values[0]] print(pn, msl) return msl except: print(pn, 'no_msl') else: pass if __name__ == "__main__": with Pool(32) as p: data2['MSL'] = data2.apply(get_page_data, axis=1) data2.to_excel('ST_MSL2.xlsx', index=False)
Public
Functions should be small - at least fit your screen's height. Otherwise they will be hard to read and hard to test. Try splitting big function into smaller ones.
Using sleep()
and hoping that something will happen within sleep duration (for example, web page loading) is a very bad practice, because nothing guarantees you that. The assumption that some event happens within some time is very weak. Instead of using some duration number, you should strictly rely on mechanisms that report when the page is loaded. As a simple example, use while True
with delays and timeout, and at each iteration check that the page was loaded.
You should include as few lines of code inside try
block as possible. First, you may accidentally catch exceptions from lines which you didn't want to be caught. Second, the smaller try ... except
blocks are, the less cognitive load it makes.
Calculating something multiple times is redundant - use variables to store result of some calculation.
print()
is a nice way to output to stdout. But one day you'll need to not only write to stdout, but also, say, to a file. Another day you'll need to output only severe errors' messages, and nothing else. This all could be solved if using logging
module. Usually it's as easy as from logging import getLogger; log = getLogger(__name__)
.
Exceptions should be easy to catch. If your code throws only Exception
or ValueError
, then it's very hard to catch specific errors, because all thrown exception classes are the same. Create application-specific exceptions, so that every logical error has its own exception class: class VerySpecificException(Exception): pass
"Early quit" is a common pattern which reduces branching and indents in python code. Instead of writing if a: <1000 lines of code> else: <return / raise exception>
, revert the condition: if not a: <return / raise exception>
. This way the "else" clause is not needed, and program logic is straightforward: if something is wrong, quit or raise, otherwise go on.
This code is not really needed or may be simplified
Pool is created but not used
Create new review request