From 05cc9a881a671271a1fc799509741c42adb35200 Mon Sep 17 00:00:00 2001
From: Sanjay Viswanathan
Date: Thu, 6 Jun 2024 20:36:23 +1000
Subject: [PATCH] Revert "[Error Solved] 'executable_path' "

---
 Web_app/Scarper.py | 198 ++++++++++++++++++++-------------------------
 1 file changed, 86 insertions(+), 112 deletions(-)

diff --git a/Web_app/Scarper.py b/Web_app/Scarper.py
index a16efc7..b50687b 100644
--- a/Web_app/Scarper.py
+++ b/Web_app/Scarper.py
@@ -1,119 +1,93 @@
-import streamlit as st
-from selenium import webdriver
-from webdriver_manager.chrome import ChromeDriverManager
-from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
 import time
 import csv
 import re
 from bs4 import BeautifulSoup
-import os
-from streamlit_lottie import st_lottie
-import json
-
-with open('Movie_Animated.json', encoding='utf-8') as anim_source:
-    animation_data = json.load(anim_source)
-    st_lottie(animation_data, 1, True, True, "high", 150, -100)
-
-# Function to scrape IMDb data
-def scrape_imdb_data():
-    options = webdriver.ChromeOptions()
-    options.add_argument('--no-sandbox')
-    options.add_argument('--disable-dev-shm-usage')
-    options.add_argument('--headless') # Run Chrome in headless mode
-
-    service = Service(ChromeDriverManager().install())
-    driver = webdriver.Chrome(options=options, service=service)
-
-    driver.get('https://www.imdb.com/search/title/?title_type=tv_series,feature,tv_movie,tv_episode,tv_miniseries,tv_special&release_date=2000-01-01,2024-12-31')
-    driver.set_script_timeout(10000)
-
-    def load_more_results():
-        try:
-            load_more_button = WebDriverWait(driver, 10).until(
-                EC.element_to_be_clickable((By.XPATH, '//button[contains(@class, "ipc-see-more__button")]'))
-            )
-            driver.execute_script("arguments[0].scrollIntoView(true);", load_more_button)
-            driver.execute_script("arguments[0].click();", load_more_button)
-            time.sleep(2)
-            return True
-        except Exception as e:
-            print(f"Error: {e}")
-            return False
-
-    def save_to_csv(movies, filename='movies.csv'):
-        file_exists = os.path.isfile(filename)
-        keys = movies[0].keys()
-        with open(filename, 'a', newline='', encoding='utf-8') as output_file:
-            dict_writer = csv.DictWriter(output_file, fieldnames=keys)
-            if not file_exists:
-                dict_writer.writeheader()
-            dict_writer.writerows(movies)
+from selenium.webdriver.chrome.options import Options
+from selenium import webdriver
-    all_movies = []
-    cnt = 0
-    while cnt < 300:
-        cnt += 1
-        if not load_more_results():
+DRIVER_PATH = 'E:/chromedriver-win64/chromedriver'
+# Initialize the Chrome driver
+
+
+options = webdriver.ChromeOptions()
+options.add_argument('--no-sandbox')
+options.add_argument('--disable-dev-shm-usage')
+driver = webdriver.Chrome(options=options,executable_path=DRIVER_PATH)
+
+# Navigate to the URL
+driver.get('https://www.imdb.com/search/title/?title_type=tv_series,feature,tv_movie,tv_episode,tv_miniseries,tv_special&release_date=2000-01-01,2024-12-31')
+
+driver.set_script_timeout(10000)
+def load_more_results():
+    try:
+        load_more_button = WebDriverWait(driver, 10).until(
+            EC.element_to_be_clickable((By.XPATH, '//button[contains(@class, "ipc-see-more__button")]'))
+        )
+        driver.execute_script("arguments[0].scrollIntoView(true);", load_more_button)
+        driver.execute_script("arguments[0].click();", load_more_button)
+        time.sleep(2)
+        return True
+    except Exception as e:
+        print(f"Error: {e}")
+        return False
+def save_to_csv(movies, filename='movies.csv'):
+    keys = movies[0].keys()
+    with open(filename, 'a', newline='', encoding='utf-8') as output_file:
+        dict_writer = csv.DictWriter(output_file, fieldnames=keys)
+        dict_writer.writeheader()
+        dict_writer.writerows(movies)
+
+
+all_movies=[]
+cnt=0
+while(cnt<300):
+    cnt+=1
+    print(cnt)
+    if not load_more_results():
             break
-
-        movie_elements = driver.find_elements(By.XPATH, "//div[contains(@class, 'lister-item mode-advanced')]")
+
+movie_elements = driver.find_element(By.XPATH, "/html/body/div[2]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/ul")
+print("movie_list")
+
+html_content = movie_elements.get_attribute('outerHTML')
+print("html movie_list")
+soup = BeautifulSoup(html_content, 'html.parser')
+
+lst= soup.find_all("li", class_="ipc-metadata-list-summary-item")
+print("list")
+for i in lst:
+    org_title= i.find("h3",class_="ipc-title__text").text
+    try:
+        title=re.sub(r'\d+\.\s*', '', org_title)
+    except:
+        title="NA"
+    try:
+        year = i.find("span", class_="sc-b189961a-8 kLaxqf dli-title-metadata-item").text
-        for element in movie_elements:
-            soup = BeautifulSoup(element.get_attribute('outerHTML'), 'html.parser')
-
-            try:
-                org_title = soup.find("h3", class_="lister-item-header").find("a").text
-                title = re.sub(r'\d+\.\s*', '', org_title)
-            except:
-                title = "NA"
-
-            try:
-                year = soup.find("span", class_="lister-item-year").text
-            except:
-                year = "NA"
-
-            try:
-                rating = soup.find("div", class_="ratings-bar").find("strong").text
-            except:
-                rating = "NA"
-
-            try:
-                description = soup.find_all("p", class_="text-muted")[1].text.strip()
-            except:
-                description = "NA"
-
-            all_movies.append({
-                'title': title,
-                'type': "Tv-Series",
-                'year': year,
-                'rating': rating,
-                'description': description
-            })
-
-        if all_movies:
-            save_to_csv(all_movies)
-            all_movies = []
-
-    driver.quit()
-
-# Streamlit App
-def main():
-    st.title("IMDb Scraper")
-
-    if st.button("Scrape IMDb Data"):
-        with st.spinner("Scraping IMDb data..."):
-            scrape_imdb_data()
-        st.success("Data scraped successfully!")
-
-        # Show the CSV file content
-        st.subheader("Scraped IMDb Data:")
-        filename = 'movies.csv'
-        if os.path.exists(filename):
-            with open(filename, 'r', encoding='utf-8') as file:
-                csv_content = file.read()
-            st.code(csv_content, language='csv')
-        else:
-            st.error("CSV file not found.")
-
-if __name__ == "__main__":
-    main()
+    except:
+        year="NA"
+    try:
+        rating = i.find("span", class_='ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating').text.split()[0]
+    except:
+        rating="NA"
+    try:
+        description = i.find("div", class_='ipc-html-content-inner-div').text
+    except:
+        description = "NA"
+    all_movies.append({
+        'title': title,
+        'type':"Tv-Series",
+        'year': year,
+        'rating': rating,
+        'description': description
+    })
+
+print("saving started")
+if all_movies:
+    save_to_csv(all_movies)
+print("completed")
+driver.quit()
\ No newline at end of file
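
Note on the reverted change: newer Selenium releases (4.10 and later) no longer accept the
`executable_path` keyword on `webdriver.Chrome()`, so the reinstated
`webdriver.Chrome(options=options, executable_path=DRIVER_PATH)` call will fail there with a
TypeError. If the driver path still needs to be pinned, the Service-based construction that this
patch removes is the route Selenium 4 supports. A minimal sketch, assuming chromedriver lives at
the same DRIVER_PATH used above:

    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service

    DRIVER_PATH = 'E:/chromedriver-win64/chromedriver'

    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    # Selenium 4 style: wrap the driver path in a Service object instead of
    # passing executable_path directly to webdriver.Chrome()
    service = Service(DRIVER_PATH)
    driver = webdriver.Chrome(options=options, service=service)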