
added source code for scraping and processing

Pranshu Raj, 1 year ago
parent
Commit
8b3e1051ca

+ 34 - 0
Quine Package Quests/Job-finder/src/aggregate.py

@@ -0,0 +1,34 @@
+import os
+from datetime import datetime
+import pandas as pd
+
+
+directories = ['data/cleaned/indeed', 'data/cleaned/yc']
+date = datetime.now().strftime("%Y_%m_%d")
+all_jobs = pd.DataFrame()
+
+def get_paths(directories):
+    '''Yield the paths of all files in the given directories.'''
+    for directory in directories:
+        for filename in os.listdir(directory):
+            yield os.path.join(directory, filename)
+
+
+def get_data(path):
+    '''Read a CSV file and return it as a DataFrame.'''
+    df = pd.read_csv(path)
+    return df
+
+
+def save_aggregated_data(data, path):
+    data.to_csv(path, index=False)
+
+
+if __name__ == '__main__':
+    for path in get_paths(directories):
+        data = get_data(path)
+        all_jobs = pd.concat([all_jobs, data])
+    all_jobs = all_jobs.drop_duplicates()
+    os.makedirs('data/processed', exist_ok=True)
+    save_aggregated_data(all_jobs, f'data/processed/{date}.csv')
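Note: drop_duplicates() with no arguments compares whole rows, so the same posting scraped on two different days only collapses if every column (including any date field) matches. A keyed sketch, assuming the link column uniquely identifies a posting:

    all_jobs = all_jobs.drop_duplicates(subset=['link'], keep='last')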

+ 402 - 0
Quine Package Quests/Job-finder/src/analyse_jobs.ipynb

@@ -0,0 +1,402 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'/home/kakashi/intern-tracker/src/analysis'"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import os\n",
+    "os.getcwd()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>title</th>\n",
+       "      <th>company</th>\n",
+       "      <th>salary</th>\n",
+       "      <th>location</th>\n",
+       "      <th>link</th>\n",
+       "      <th>date</th>\n",
+       "      <th>query</th>\n",
+       "      <th>source</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Python Developer</td>\n",
+       "      <td>Infosys</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Pune, Maharashtra</td>\n",
+       "      <td>https://in.indeed.com/rc/clk?jk=b0a156d0bd60b7...</td>\n",
+       "      <td>Posted 2 days ago</td>\n",
+       "      <td>python developer</td>\n",
+       "      <td>indeed</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Junior Python Developer</td>\n",
+       "      <td>1E9 Advisors</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Aundh, Pune, Maharashtra</td>\n",
+       "      <td>https://in.indeed.com/rc/clk?jk=6227a113217cc2...</td>\n",
+       "      <td>Posted 24 days ago</td>\n",
+       "      <td>python developer</td>\n",
+       "      <td>indeed</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Entry-Level Software Developer</td>\n",
+       "      <td>Tantransh Solutions</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Bajaj Nagar, Nagpur, Maharashtra</td>\n",
+       "      <td>https://in.indeed.com/rc/clk?jk=43540174e00001...</td>\n",
+       "      <td>Posted 13 days ago</td>\n",
+       "      <td>python developer</td>\n",
+       "      <td>indeed</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Python Developer</td>\n",
+       "      <td>QuantGrade</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Remote in Noida, Uttar Pradesh</td>\n",
+       "      <td>https://in.indeed.com/rc/clk?jk=055ccbf93d79b7...</td>\n",
+       "      <td>Posted 7 days ago</td>\n",
+       "      <td>python developer</td>\n",
+       "      <td>indeed</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Python (Programming Language)-Application Deve...</td>\n",
+       "      <td>Accenture</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Bengaluru, Karnataka</td>\n",
+       "      <td>https://in.indeed.com/rc/clk?jk=62317f94ed4532...</td>\n",
+       "      <td>Today</td>\n",
+       "      <td>python developer</td>\n",
+       "      <td>indeed</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                               title              company  \\\n",
+       "0                                   Python Developer              Infosys   \n",
+       "1                            Junior Python Developer         1E9 Advisors   \n",
+       "2                     Entry-Level Software Developer  Tantransh Solutions   \n",
+       "3                                   Python Developer           QuantGrade   \n",
+       "4  Python (Programming Language)-Application Deve...            Accenture   \n",
+       "\n",
+       "  salary                          location  \\\n",
+       "0    NaN                 Pune, Maharashtra   \n",
+       "1    NaN          Aundh, Pune, Maharashtra   \n",
+       "2    NaN  Bajaj Nagar, Nagpur, Maharashtra   \n",
+       "3    NaN    Remote in Noida, Uttar Pradesh   \n",
+       "4    NaN              Bengaluru, Karnataka   \n",
+       "\n",
+       "                                                link                date  \\\n",
+       "0  https://in.indeed.com/rc/clk?jk=b0a156d0bd60b7...   Posted 2 days ago   \n",
+       "1  https://in.indeed.com/rc/clk?jk=6227a113217cc2...  Posted 24 days ago   \n",
+       "2  https://in.indeed.com/rc/clk?jk=43540174e00001...  Posted 13 days ago   \n",
+       "3  https://in.indeed.com/rc/clk?jk=055ccbf93d79b7...   Posted 7 days ago   \n",
+       "4  https://in.indeed.com/rc/clk?jk=62317f94ed4532...               Today   \n",
+       "\n",
+       "              query  source  \n",
+       "0  python developer  indeed  \n",
+       "1  python developer  indeed  \n",
+       "2  python developer  indeed  \n",
+       "3  python developer  indeed  \n",
+       "4  python developer  indeed  "
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data = pd.read_csv('/home/kakashi/intern-tracker/data/cleaned/indeed/2024_03_15.csv')\n",
+    "data.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 225 entries, 0 to 224\n",
+      "Data columns (total 8 columns):\n",
+      " #   Column    Non-Null Count  Dtype \n",
+      "---  ------    --------------  ----- \n",
+      " 0   title     218 non-null    object\n",
+      " 1   company   225 non-null    object\n",
+      " 2   salary    31 non-null     object\n",
+      " 3   location  225 non-null    object\n",
+      " 4   link      225 non-null    object\n",
+      " 5   date      225 non-null    object\n",
+      " 6   query     225 non-null    object\n",
+      " 7   source    225 non-null    object\n",
+      "dtypes: object(8)\n",
+      "memory usage: 14.2+ KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "data.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(225, 8)"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>count</th>\n",
+       "      <th>unique</th>\n",
+       "      <th>top</th>\n",
+       "      <th>freq</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>title</th>\n",
+       "      <td>218</td>\n",
+       "      <td>117</td>\n",
+       "      <td>Python Developer</td>\n",
+       "      <td>37</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>company</th>\n",
+       "      <td>225</td>\n",
+       "      <td>154</td>\n",
+       "      <td>Oracle</td>\n",
+       "      <td>11</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>salary</th>\n",
+       "      <td>31</td>\n",
+       "      <td>29</td>\n",
+       "      <td>₹15,000 - ₹70,000 a month</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>location</th>\n",
+       "      <td>225</td>\n",
+       "      <td>52</td>\n",
+       "      <td>Remote</td>\n",
+       "      <td>44</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>link</th>\n",
+       "      <td>225</td>\n",
+       "      <td>219</td>\n",
+       "      <td>https://in.indeed.comnan</td>\n",
+       "      <td>7</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>date</th>\n",
+       "      <td>225</td>\n",
+       "      <td>31</td>\n",
+       "      <td>Posted 30+ days ago</td>\n",
+       "      <td>52</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>query</th>\n",
+       "      <td>225</td>\n",
+       "      <td>3</td>\n",
+       "      <td>python developer</td>\n",
+       "      <td>75</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>source</th>\n",
+       "      <td>225</td>\n",
+       "      <td>1</td>\n",
+       "      <td>indeed</td>\n",
+       "      <td>225</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         count unique                        top freq\n",
+       "title      218    117           Python Developer   37\n",
+       "company    225    154                     Oracle   11\n",
+       "salary      31     29  ₹15,000 - ₹70,000 a month    2\n",
+       "location   225     52                     Remote   44\n",
+       "link       225    219   https://in.indeed.comnan    7\n",
+       "date       225     31        Posted 30+ days ago   52\n",
+       "query      225      3           python developer   75\n",
+       "source     225      1                     indeed  225"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data.describe().T"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[nan '₹10,00,000 - ₹12,00,000 a year' 'Up to ₹50,000 a month'\n",
+      " '₹15,000 - ₹70,000 a month' '₹20,000 - ₹30,000 a month'\n",
+      " 'Up to ₹5,00,000 a year' '₹15,000 - ₹25,000 a month'\n",
+      " '₹41,40,000 - ₹62,10,000 a year' '₹8,00,000 - ₹18,00,000 a year'\n",
+      " 'Up to ₹60,000 a month' '₹30,000 - ₹45,000 a month'\n",
+      " '₹25,000 - ₹80,000 a month' '₹1,44,000 - ₹3,60,000 a year'\n",
+      " '₹40,000 - ₹60,000 a month' 'From ₹90,000 a month'\n",
+      " '₹90,000 - ₹1,00,000 a month' '₹10,00,000 - ₹26,00,000 a year'\n",
+      " '₹80,000 - ₹1,00,000 a month' '₹40,000 a month'\n",
+      " '₹30,00,000 - ₹35,00,000 a year' '₹4,00,000 - ₹5,00,000 a year'\n",
+      " '₹40,000 - ₹45,000 a month' '₹4,00,000 - ₹8,00,000 a year'\n",
+      " '₹90,000 - ₹1,60,000 a month' '₹10,00,000 - ₹25,00,000 a year'\n",
+      " '₹8,00,000 - ₹12,00,000 a year' '₹15,000 - ₹30,000 a month'\n",
+      " '₹35,000 - ₹65,000 a month' '₹30,000 - ₹50,000 a month'\n",
+      " '₹11,547.68 - ₹52,691.43 a month']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(data.salary.unique())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def process_salary(sample):\n",
+    "    salary = sample['salary']\n",
+    "    if salary !='NaN':\n",
+    "        if salary.endswith('year'):\n",
+    "            "
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "production",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
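With the process_salary sketch above, a numeric column can be derived row-wise, e.g.:

    data['salary_monthly'] = data.apply(process_salary, axis=1)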

+ 151 - 0
Quine Package Quests/Job-finder/src/grab_indeed.py

@@ -0,0 +1,151 @@
+import os
+import logging
+import random
+import time
+import datetime
+import pandas as pd
+from selenium import webdriver
+from bs4 import BeautifulSoup
+from urllib.parse import quote_plus
+from selenium.common.exceptions import TimeoutException
+
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(levelname)s - %(message)s",
+    datefmt="%d_%m_%y %H:%M:%S",
+)
+# Firefox has worked reliably for this scraper
+driver = webdriver.Firefox()
+
+job_titles = [
+    "python developer",
+    "data analyst",
+    "machine learning engineer",
+    "software engineer",
+    "backend developer",
+    "devops engineer",
+    # "automation engineer",
+    # "network engineer",
+    # "vuejs developer",
+    "react developer",
+    # "nodejs developer",
+    # "frontend developer",
+    "full stack developer",
+    # "ui developer",
+    # "web application developer",
+    # "javascript engineer",
+    "mobile app developer",
+]
+
+# pagination limits
+num_pages = 1
+current_date = datetime.datetime.now().strftime("%Y_%m_%d")
+random.seed(int(datetime.datetime.now().strftime("%d")))
+start_time = time.time()
+jobs_df = pd.DataFrame()
+
+
+def get_job_description(link: str):
+    """
+    Get the job description using the job links scraped."""
+    try:
+        # open a new tab and switch to it (substitute for context management)
+        driver.execute_script('window.open("");')
+        driver.switch_to.window(driver.window_handles[-1])
+        # go to the page and parse it for description
+        driver.get(link)
+        soup = BeautifulSoup(driver.page_source, "html.parser")
+        description = soup.find("div", attrs={"id": "jobDescriptionText"}).text
+        # close the tab and go back to original window (job listings)
+        time.sleep(5 + random.random() * 3)
+        driver.close()
+        driver.switch_to.window(driver.window_handles[0])
+        time.sleep(2)
+        return description
+
+    except Exception as e:
+        logging.exception(f"Exception {e} occured while getting JD")
+        return None
+
+
+def get_jobs(soup):
+    containers = soup.find_all("div", class_="job_seen_beacon")
+
+    jobs = []
+    for container in containers:
+        job_title_element = container.find("h2", class_="jobTitle css-14z7akl eu4oa1w0")
+        company_element = container.find("span", {"data-testid": "company-name"})
+        salary_element = container.find(
+            "div", {"class": "metadata salary-snippet-container css-5zy3wz eu4oa1w0"}
+        )
+        location_element = container.find("div", {"data-testid": "text-location"})
+        date_element = container.find("span", {"class": "css-qvloho eu4oa1w0"})
+
+        if job_title_element is None:
+            # skip containers without a parsable title link
+            continue
+        job_title = job_title_element.text
+        company = company_element.text if company_element else None
+        salary = salary_element.text if salary_element else None
+        location = location_element.text if location_element else None
+        link = "https://in.indeed.com" + str(job_title_element.find("a")["href"])
+        # date = list(date_element.children)[-1] if date_element else None
+        job_description = get_job_description(link)
+
+        jobs.append(
+            {
+                "title": job_title,
+                "company": company,
+                "salary": salary,
+                "location": location,
+                "link": link,
+                "description": job_description,
+            }
+        )
+    return jobs
+
+
+for title in job_titles:
+    all_jobs = []
+    base_url = f"https://in.indeed.com/jobs?q={quote_plus(title)}&from=searchOnHP"
+
+    logging.info(f"Starting process - scrape {title} jobs from indeed")
+    time.sleep(20 + random.random() * 5)
+
+    for i in range(num_pages):
+        try:
+            driver.get(base_url + "&start=" + str(i * 10))
+        except TimeoutException:
+            logging.exception(f"Timeout while loading url")
+        # implicit wait - stops when page loads or time is over
+        driver.implicitly_wait(15)
+        # random delay so requests don't land at a fixed cadence
+        time.sleep(30 * random.random())
+        html = driver.page_source
+
+        soup = BeautifulSoup(html, "html.parser")
+
+        found_jobs = get_jobs(soup)
+        all_jobs.extend(found_jobs)
+
+    # Create the output directory if it doesn't exist; it must match the save path below
+    directory = os.path.join(os.getcwd(), "data/raw/desc")
+    os.makedirs(directory, exist_ok=True)
+    logging.info(f"saving to {directory}")
+
+    df = pd.DataFrame(all_jobs)
+    df["query"] = title
+    df["source"] = "indeed"
+    jobs_df = pd.concat([jobs_df, df], ignore_index=True)
+    # changing definition of date to date the job was scraped
+    jobs_df["date"] = current_date
+    jobs_df.to_csv(f"data/raw/desc/{current_date}desc_in.csv", index=False)
+
+    logging.info(f"Done with {title}, scraped {len(all_jobs)} jobs")
+
+driver.quit()
+end_time = time.time()
+
+logging.info(f"Done in {end_time-start_time} seconds")

+ 58 - 0
Quine Package Quests/Job-finder/src/process_jd.py

@@ -0,0 +1,58 @@
+import pandas as pd
+import nltk
+import string
+import json
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import WordNetLemmatizer
+from sklearn.feature_extraction.text import TfidfVectorizer
+from scipy.sparse import save_npz
+
+# Download the necessary NLTK data
+nltk.download('punkt')
+nltk.download('stopwords')
+nltk.download('wordnet')
+
+# Initialize the lemmatizer and build the stopword set once (set lookups are fast)
+lemmatizer = WordNetLemmatizer()
+stop_words = set(stopwords.words('english'))
+
+# Load the queries (a list of word lists) and flatten them into a set
+with open('data/queries.json', 'r') as f:
+    queries = json.load(f)
+query_words = {word for sublist in queries for word in sublist}
+
+def preprocess_text(text):
+    # Check if text is not NaN
+    if pd.isnull(text):
+        return ''
+    
+    # Convert to lower case
+    text = text.lower()
+    
+    # Remove punctuation
+    text = text.translate(str.maketrans('', '', string.punctuation))
+    
+    # Tokenize
+    words = word_tokenize(text)
+    
+    # Remove stopwords, lemmatize, and keep only words that appear in the queries
+    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words and word in query_words]
+    
+    # Join words back into a string
+    text = ' '.join(words)
+    
+    return text
+
+# Load the data
+df = pd.read_csv('data/desc/2024_03_17desc_in.csv')
+
+# Preprocess the descriptions
+df['description'] = df['description'].apply(preprocess_text)
+
+# Initialize the vectorizer
+vectorizer = TfidfVectorizer()
+
+# Vectorize the descriptions; keep the sparse matrix in its own file, since
+# dense vectors written into a CSV column get stringified and are hard to recover
+tfidf_matrix = vectorizer.fit_transform(df['description'])
+save_npz('description_tfidf.npz', tfidf_matrix)  # assumed output path
+
+# Save the processed data (cleaned text, not vectors)
+df.to_csv('processed_data.csv', index=False)
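A downstream consumer can reload the cleaned text and the sparse matrix saved above (the .npz path is an assumption from this script; rows align with the CSV):

    from scipy.sparse import load_npz
    import pandas as pd

    df = pd.read_csv('processed_data.csv')
    tfidf_matrix = load_npz('description_tfidf.npz')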

+ 14 - 0
Quine Package Quests/Job-finder/src/scoring.py

@@ -0,0 +1,14 @@
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import pandas as pd
+
+df = pd.read_csv('your_file.csv')
+queries_df = pd.read_json('your_queries.json')
+job_descriptions = df['description'].fillna('').tolist()
+queries = queries_df['query'].tolist()
+
+# cosine_similarity needs vectors, not raw strings: fit TF-IDF on the
+# descriptions, then project each query into the same vector space
+vectorizer = TfidfVectorizer()
+description_vectors = vectorizer.fit_transform(job_descriptions)
+
+for query in queries:
+    query_vector = vectorizer.transform([query])
+    df['score_' + query] = cosine_similarity(description_vectors, query_vector).ravel()
+
+df.to_csv('your_scored_file.csv', index=False)
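The per-query score columns can then rank listings, e.g. assuming 'python developer' is one of the queries:

    top = df.sort_values('score_python developer', ascending=False)
    print(top[['title', 'company', 'score_python developer']].head(10))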

+ 47 - 0
Quine Package Quests/Job-finder/src/scraping_template.py

@@ -0,0 +1,47 @@
+import time
+from selenium import webdriver
+from bs4 import BeautifulSoup
+
+# * Template for JS-enabled websites; customize get_data() for each target site
+
+def get_data(soup):
+    """
+    Extract data from a BeautifulSoup object and return it as a list.
+    This function should be customized for each specific scraping task.
+    """
+    # TODO: Implement this function; returning an empty list keeps the template runnable
+    return []
+
+def scrape_pages(base_url, num_pages):
+    """
+    Scrape multiple pages of a website using Selenium and BeautifulSoup.
+    """
+
+    driver = webdriver.Firefox()
+    all_data = []
+
+    for i in range(num_pages):
+        driver.get(base_url + str(i*10))
+        driver.implicitly_wait(10)
+        html = driver.page_source
+        time.sleep(5)
+        soup = BeautifulSoup(html, 'html.parser')
+        page_data = get_data(soup)
+        all_data.extend(page_data)
+
+    driver.quit()
+
+    return all_data
+
+def main():
+    base_url = "https://www.example.com/page?start="
+    num_pages = 5
+    data = scrape_pages(base_url, num_pages)
+    for item in data:
+        print(item)
+
+# TODO: implement some way of storing the data.
+# If storing in CSV, how do we check for and remove duplicates to reduce computation?
+
+if __name__ == "__main__":
+    main()
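As a sketch of the per-site customization, a get_data for a hypothetical listing page whose cards are div.result elements might look like:

    def get_data(soup):
        items = []
        for card in soup.select('div.result'):  # hypothetical selector
            title = card.select_one('h2')
            items.append({'title': title.get_text(strip=True) if title else None})
        return items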

+ 112 - 0
Quine Package Quests/Job-finder/src/yc.py

@@ -0,0 +1,112 @@
+import time
+import logging
+from selenium import webdriver
+from bs4 import BeautifulSoup
+import random
+import pandas as pd
+import datetime
+
+# TODO: yc 16 has some problems with the links (processed twice); add a duplicate check
+
+
+date = datetime.datetime.now().strftime("%Y_%m_%d")
+BASE_URL = "https://www.ycombinator.com/jobs/role"
+driver = webdriver.Firefox()
+
+
+def get_job_description(link):
+    """
+    Get the job description using the job links scraped."""
+    try:
+        # open a new tab and switch to it (substitute for context management)
+        driver.execute_script('window.open("");')
+        driver.switch_to.window(driver.window_handles[-1])
+        # go to the page and parse it for description
+        driver.get(link)
+        soup = BeautifulSoup(driver.page_source, "html.parser")
+        description = soup.find("div", attrs={"class": "prose max-w-full"}).text
+        # close the tab and go back to original window (job listings)
+        time.sleep(10 + random.random() * 5)
+        driver.close()
+        driver.switch_to.window(driver.window_handles[0])
+        return description
+
+    except Exception as e:
+        logging.exception(f"Exception {e} occured while getting JD")
+        return None
+
+
+def get_data(soup):
+    containers = soup.find_all(
+        "div",
+        class_="mb-1 flex flex-col flex-nowrap items-center justify-between gap-y-2 md:flex-row md:gap-y-0",
+    )
+
+    jobs = []
+    for container in containers:
+        job_title_element = container.find("a", class_="font-semibold text-linkColor")
+        if job_title_element is None:
+            # skip cards without a title link; otherwise values from the
+            # previous iteration would leak into this job's record
+            continue
+        job_title = job_title_element.text
+        link = 'https://ycombinator.com' + job_title_element["href"]
+
+        company_element = container.find("span", class_="block font-bold md:inline")
+        company = company_element.text if company_element else None
+
+        location_element = container.find(
+            "div",
+            class_="border-r border-gray-300 px-2 first-of-type:pl-0 last-of-type:border-none last-of-type:pr-0",
+        )
+        location = location_element.text if location_element else None
+
+        date_posted_element = container.find(
+            "span", class_="hidden text-sm text-gray-400 md:inline"
+        )
+        # currently unused downstream; "date" below records the scrape date
+        date_posted = (
+            date_posted_element.text.strip().split("(")[1].split(")")[0]
+            if date_posted_element and "(" in date_posted_element.text
+            else None
+        )
+
+        job_description = get_job_description(link)
+
+        jobs.append(
+            {
+                "title": job_title,
+                "company": company,
+                "location": location,
+                "link": link,
+                "description": job_description,
+                "date": date,
+            }
+        )
+    jobs = pd.DataFrame(jobs)
+    return jobs
+
+
+def scrape_pages(base_url, num_pages):
+    all_data = pd.DataFrame()
+
+    for _ in range(num_pages):
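+        # NOTE: the same URL is fetched on every pass; pagination is not implemented yet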
+        driver.get(base_url)
+        driver.implicitly_wait(15)
+        html = driver.page_source
+        time.sleep(3 + random.random() * 10)
+        soup = BeautifulSoup(html, "html.parser")
+        page_data = get_data(soup)
+        all_data = pd.concat([all_data, page_data])
+
+    driver.quit()
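+    # YC pages aren't searched by query, so reuse the title to fill the shared "query" column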
+    all_data["query"] = all_data["title"]
+    return all_data
+
+
+def main():
+    num_pages = 1
+    data = scrape_pages(BASE_URL, num_pages)
+    data["source"] = "yc"
+    data.to_csv(f"data/raw/desc/{str(date)}desc_yc.csv", index=False)
+
+
+if __name__ == "__main__":
+    main()
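The duplicate-links TODO at the top of yc.py could be handled before saving, assuming link uniquely identifies a posting:

    data = data.drop_duplicates(subset=['link'])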