from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import pandas as pd
import time
import requests
from bs4 import BeautifulSoup as BS
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor


user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
option = Options()
option.add_argument('--no-sandbox')
option.add_argument("--disable-infobars")
option.add_argument(f'user-agent={user_agent}')
option.add_argument('--headless')

data2 = pd.read_excel('ST.xlsx')

def get_page_url(pn):
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=option)
    driver.implicitly_wait(2)
    driver.get(f'https://www.digikey.com/en/products/result?keywords={pn}')
    time.sleep(1)
    if '/detail' in driver.current_url:
        url = driver.current_url
        driver.quit()
        return url
    soup = BS(driver.page_source, 'lxml')
    driver.quit()


def get_page_data(row):
    pn = row['Название']
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=option)
    driver.implicitly_wait(2)
    driver.get(f'https://www.digikey.com/en/products/result?keywords={pn}')
    time.sleep(1)
    if '/detail' in driver.current_url:
        soup = BS(driver.page_source, 'lxml')
        driver.quit()
        tab = soup.find_all('table', class_='jss62 jss58')
        try:
            table = pd.read_html(str(tab))
            table = pd.DataFrame(table[-2])
            msl = table['Description'].values[table.loc[table['Attribute'] == 'Moisture Sensitivity Level (MSL)'].index.values[0]]
            print(pn, msl)
            return msl
        except:
            print(pn, 'No MSL')
        try:
            table = pd.read_html(str(tab))
            table = pd.DataFrame(table[-1])
            msl = table['Description'].values[
                table.loc[table['Attribute'] == 'Moisture Sensitivity Level (MSL)'].index.values[0]]
            print(pn, msl)
            return msl
        except:
            print(pn, 'No MSL')
        try:
            table = pd.read_html(str(tab))
            table = pd.DataFrame(table[0])
            msl = table['Description'].values[
                table.loc[table['Attribute'] == 'Moisture Sensitivity Level (MSL)'].index.values[0]]
            print(pn, msl)
            return msl
        except:
            print(pn, 'No MSL')
        try:
            table = pd.read_html(str(tab))
            table = pd.DataFrame(table[-3])
            msl = table['Description'].values[
                table.loc[table['Attribute'] == 'Moisture Sensitivity Level (MSL)'].index.values[0]]
            print(pn, msl)
            return msl
        except:
            print(pn, 'No MSL')



    else:
        for i in driver.find_elements(By.XPATH, '//*[@id="__next"]/main/div/div/div[1]/div[2]/div[2]/section/div/div/a'):
            if 'stmicroelectronics' in i.get_attribute('href'):
                url = i.get_attribute('href')
                driver.quit()
                driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=option)
                driver.get(url)
                time.sleep(2)
                soup = BS(driver.page_source, 'lxml')
                driver.quit()
                tab = soup.find_all('table', class_='jss62 jss58')
                # print(tab)
                try:
                    table = pd.read_html(str(tab))
                    table = pd.DataFrame(table[-3])
                    msl = table['Description'].values[table.loc[table['Attribute'] == 'Moisture Sensitivity Level (MSL)'].index.values[0]]
                    print(pn, msl)
                    return msl
                except:
                    print(pn, 'no_msl')
                try:
                    table = pd.read_html(str(tab))
                    table = pd.DataFrame(table[-2])
                    msl = table['Description'].values[
                        table.loc[table['Attribute'] == 'Moisture Sensitivity Level (MSL)'].index.values[0]]
                    print(pn, msl)
                    return msl
                except:
                    print(pn, 'no_msl')
                try:
                    table = pd.read_html(str(tab))
                    table = pd.DataFrame(table[-1])
                    msl = table['Description'].values[
                        table.loc[table['Attribute'] == 'Moisture Sensitivity Level (MSL)'].index.values[0]]
                    print(pn, msl)
                    return msl
                except:
                    print(pn, 'no_msl')
                try:
                    table = pd.read_html(str(tab))
                    table = pd.DataFrame(table[-4])
                    msl = table['Description'].values[
                        table.loc[table['Attribute'] == 'Moisture Sensitivity Level (MSL)'].index.values[0]]
                    print(pn, msl)
                    return msl
                except:
                    print(pn, 'no_msl')

            else:
                pass



if __name__ == "__main__":
    with Pool(32) as p:
        data2['MSL'] = data2.apply(get_page_data, axis=1)

data2.to_excel('ST_MSL2.xlsx', index=False)

 Public
Share a link to this review

5.48% issue ratio

R41 Too many lines of code

Functions should be small - at least fit your screen's height. Otherwise they will be hard to read and hard to test. Try splitting big function into smaller ones.

L64 Sleeping with hope

Using sleep() and hoping that something will happen within sleep duration (for example, web page loading) is a very bad practice, because nothing guarantees you that. The assumption that some event happens within some time is very weak. Instead of using some duration number, you should strictly rely on mechanisms that report when the page is loaded. As a simple example, use while True with delays and timeout, and at each iteration check that the page was loaded.

L42 Too big "try" block

You should include as few lines of code inside try block as possible. First, you may accidentally catch exceptions from lines which you didn't want to be caught. Second, the smaller try ... except blocks are, the less cognitive load it makes.

O23 Calculated multiple times

Calculating something multiple times is redundant - use variables to store result of some calculation.

O15 Using print()

print() is a nice way to output to stdout. But one day you'll need to not only write to stdout, but also, say, to a file. Another day you'll need to output only severe errors' messages, and nothing else. This all could be solved if using logging module. Usually it's as easy as from logging import getLogger; log = getLogger(__name__).

L39 Using generic exception

Exceptions should be easy to catch. If your code throws only Exception or ValueError, then it's very hard to catch specific errors, because all thrown exception classes are the same. Create application-specific exceptions, so that every logical error has its own exception class: class VerySpecificException(Exception): pass

R14 Not using "early quit"

"Early quit" is a common pattern which reduces branching and indents in python code. Instead of writing if a: <1000 lines of code> else: <return / raise exception>, revert the condition: if not a: <return / raise exception>. This way the "else" clause is not needed, and program logic is straightforward: if something is wrong, quit or raise, otherwise go on.

L12 Redundant code / overengineering

This code is not really needed or may be simplified

Pool is created but not used


Create new review request