
added source code for scraping and processing

Pranshu Raj, 1 year ago
parent
Commit
8b3e1051ca

+ 34 - 0
Quine Package Quests/Job-finder/src/aggregate.py

@@ -0,0 +1,34 @@
+import os
+from datetime import datetime
+import pandas as pd
+
+
+directories = ['data/cleaned/indeed', 'data/cleaned/yc']
+date = datetime.now().strftime("%Y_%m_%d")
+all_jobs = pd.DataFrame()
+
+def get_paths(directories):
+    '''Yield the paths of all files in the given directories.'''
+    for directory in directories:
+        for filename in os.listdir(directory):
+            yield os.path.join(directory, filename)
+
+
+def get_data(path):
+    '''Read a CSV file and return it as a DataFrame.'''
+    df = pd.read_csv(path)
+    return df
+
+
+def save_aggregated_data(data, path):
+    data.to_csv(path, index=False)
+
+
+if __name__ == '__main__':
+    for path in get_paths(directories):
+        data = get_data(path)
+        all_jobs = pd.concat([all_jobs, data])
+    all_jobs = all_jobs.drop_duplicates()
+    os.makedirs('data/processed', exist_ok=True)
+    save_aggregated_data(all_jobs, f'data/processed/{date}.csv')
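Note: drop_duplicates() with no arguments compares whole rows, so the same posting scraped on two different days only collapses if every column (including any date field) matches. A keyed sketch, assuming the link column uniquely identifies a posting:

    all_jobs = all_jobs.drop_duplicates(subset=['link'], keep='last')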

+ 402 - 0
Quine Package Quests/Job-finder/src/analyse_jobs.ipynb

@@ -0,0 +1,402 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'/home/kakashi/intern-tracker/src/analysis'"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import os\n",
+    "os.getcwd()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>title</th>\n",
+       "      <th>company</th>\n",
+       "      <th>salary</th>\n",
+       "      <th>location</th>\n",
+       "      <th>link</th>\n",
+       "      <th>date</th>\n",
+       "      <th>query</th>\n",
+       "      <th>source</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Python Developer</td>\n",
+       "      <td>Infosys</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Pune, Maharashtra</td>\n",
+       "      <td>https://in.indeed.com/rc/clk?jk=b0a156d0bd60b7...</td>\n",
+       "      <td>Posted 2 days ago</td>\n",
+       "      <td>python developer</td>\n",
+       "      <td>indeed</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Junior Python Developer</td>\n",
+       "      <td>1E9 Advisors</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Aundh, Pune, Maharashtra</td>\n",
+       "      <td>https://in.indeed.com/rc/clk?jk=6227a113217cc2...</td>\n",
+       "      <td>Posted 24 days ago</td>\n",
+       "      <td>python developer</td>\n",
+       "      <td>indeed</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Entry-Level Software Developer</td>\n",
+       "      <td>Tantransh Solutions</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Bajaj Nagar, Nagpur, Maharashtra</td>\n",
+       "      <td>https://in.indeed.com/rc/clk?jk=43540174e00001...</td>\n",
+       "      <td>Posted 13 days ago</td>\n",
+       "      <td>python developer</td>\n",
+       "      <td>indeed</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Python Developer</td>\n",
+       "      <td>QuantGrade</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Remote in Noida, Uttar Pradesh</td>\n",
+       "      <td>https://in.indeed.com/rc/clk?jk=055ccbf93d79b7...</td>\n",
+       "      <td>Posted 7 days ago</td>\n",
+       "      <td>python developer</td>\n",
+       "      <td>indeed</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Python (Programming Language)-Application Deve...</td>\n",
+       "      <td>Accenture</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Bengaluru, Karnataka</td>\n",
+       "      <td>https://in.indeed.com/rc/clk?jk=62317f94ed4532...</td>\n",
+       "      <td>Today</td>\n",
+       "      <td>python developer</td>\n",
+       "      <td>indeed</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                               title              company  \\\n",
+       "0                                   Python Developer              Infosys   \n",
+       "1                            Junior Python Developer         1E9 Advisors   \n",
+       "2                     Entry-Level Software Developer  Tantransh Solutions   \n",
+       "3                                   Python Developer           QuantGrade   \n",
+       "4  Python (Programming Language)-Application Deve...            Accenture   \n",
+       "\n",
+       "  salary                          location  \\\n",
+       "0    NaN                 Pune, Maharashtra   \n",
+       "1    NaN          Aundh, Pune, Maharashtra   \n",
+       "2    NaN  Bajaj Nagar, Nagpur, Maharashtra   \n",
+       "3    NaN    Remote in Noida, Uttar Pradesh   \n",
+       "4    NaN              Bengaluru, Karnataka   \n",
+       "\n",
+       "                                                link                date  \\\n",
+       "0  https://in.indeed.com/rc/clk?jk=b0a156d0bd60b7...   Posted 2 days ago   \n",
+       "1  https://in.indeed.com/rc/clk?jk=6227a113217cc2...  Posted 24 days ago   \n",
+       "2  https://in.indeed.com/rc/clk?jk=43540174e00001...  Posted 13 days ago   \n",
+       "3  https://in.indeed.com/rc/clk?jk=055ccbf93d79b7...   Posted 7 days ago   \n",
+       "4  https://in.indeed.com/rc/clk?jk=62317f94ed4532...               Today   \n",
+       "\n",
+       "              query  source  \n",
+       "0  python developer  indeed  \n",
+       "1  python developer  indeed  \n",
+       "2  python developer  indeed  \n",
+       "3  python developer  indeed  \n",
+       "4  python developer  indeed  "
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data = pd.read_csv('/home/kakashi/intern-tracker/data/cleaned/indeed/2024_03_15.csv')\n",
+    "data.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 225 entries, 0 to 224\n",
+      "Data columns (total 8 columns):\n",
+      " #   Column    Non-Null Count  Dtype \n",
+      "---  ------    --------------  ----- \n",
+      " 0   title     218 non-null    object\n",
+      " 1   company   225 non-null    object\n",
+      " 2   salary    31 non-null     object\n",
+      " 3   location  225 non-null    object\n",
+      " 4   link      225 non-null    object\n",
+      " 5   date      225 non-null    object\n",
+      " 6   query     225 non-null    object\n",
+      " 7   source    225 non-null    object\n",
+      "dtypes: object(8)\n",
+      "memory usage: 14.2+ KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "data.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(225, 8)"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>count</th>\n",
+       "      <th>unique</th>\n",
+       "      <th>top</th>\n",
+       "      <th>freq</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>title</th>\n",
+       "      <td>218</td>\n",
+       "      <td>117</td>\n",
+       "      <td>Python Developer</td>\n",
+       "      <td>37</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>company</th>\n",
+       "      <td>225</td>\n",
+       "      <td>154</td>\n",
+       "      <td>Oracle</td>\n",
+       "      <td>11</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>salary</th>\n",
+       "      <td>31</td>\n",
+       "      <td>29</td>\n",
+       "      <td>₹15,000 - ₹70,000 a month</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>location</th>\n",
+       "      <td>225</td>\n",
+       "      <td>52</td>\n",
+       "      <td>Remote</td>\n",
+       "      <td>44</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>link</th>\n",
+       "      <td>225</td>\n",
+       "      <td>219</td>\n",
+       "      <td>https://in.indeed.comnan</td>\n",
+       "      <td>7</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>date</th>\n",
+       "      <td>225</td>\n",
+       "      <td>31</td>\n",
+       "      <td>Posted 30+ days ago</td>\n",
+       "      <td>52</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>query</th>\n",
+       "      <td>225</td>\n",
+       "      <td>3</td>\n",
+       "      <td>python developer</td>\n",
+       "      <td>75</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>source</th>\n",
+       "      <td>225</td>\n",
+       "      <td>1</td>\n",
+       "      <td>indeed</td>\n",
+       "      <td>225</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         count unique                        top freq\n",
+       "title      218    117           Python Developer   37\n",
+       "company    225    154                     Oracle   11\n",
+       "salary      31     29  ₹15,000 - ₹70,000 a month    2\n",
+       "location   225     52                     Remote   44\n",
+       "link       225    219   https://in.indeed.comnan    7\n",
+       "date       225     31        Posted 30+ days ago   52\n",
+       "query      225      3           python developer   75\n",
+       "source     225      1                     indeed  225"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data.describe().T"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[nan '₹10,00,000 - ₹12,00,000 a year' 'Up to ₹50,000 a month'\n",
+      " '₹15,000 - ₹70,000 a month' '₹20,000 - ₹30,000 a month'\n",
+      " 'Up to ₹5,00,000 a year' '₹15,000 - ₹25,000 a month'\n",
+      " '₹41,40,000 - ₹62,10,000 a year' '₹8,00,000 - ₹18,00,000 a year'\n",
+      " 'Up to ₹60,000 a month' '₹30,000 - ₹45,000 a month'\n",
+      " '₹25,000 - ₹80,000 a month' '₹1,44,000 - ₹3,60,000 a year'\n",
+      " '₹40,000 - ₹60,000 a month' 'From ₹90,000 a month'\n",
+      " '₹90,000 - ₹1,00,000 a month' '₹10,00,000 - ₹26,00,000 a year'\n",
+      " '₹80,000 - ₹1,00,000 a month' '₹40,000 a month'\n",
+      " '₹30,00,000 - ₹35,00,000 a year' '₹4,00,000 - ₹5,00,000 a year'\n",
+      " '₹40,000 - ₹45,000 a month' '₹4,00,000 - ₹8,00,000 a year'\n",
+      " '₹90,000 - ₹1,60,000 a month' '₹10,00,000 - ₹25,00,000 a year'\n",
+      " '₹8,00,000 - ₹12,00,000 a year' '₹15,000 - ₹30,000 a month'\n",
+      " '₹35,000 - ₹65,000 a month' '₹30,000 - ₹50,000 a month'\n",
+      " '₹11,547.68 - ₹52,691.43 a month']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(data.salary.unique())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def process_salary(sample):\n",
+    "    salary = sample['salary']\n",
+    "    if salary !='NaN':\n",
+    "        if salary.endswith('year'):\n",
+    "            "
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "production",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
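With the process_salary sketch above, a numeric column can be derived row-wise, e.g.:

    data['salary_monthly'] = data.apply(process_salary, axis=1)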

+ 151 - 0
Quine Package Quests/Job-finder/src/grab_indeed.py

@@ -0,0 +1,151 @@
+import os
+import logging
+import random
+import time
+import datetime
+import pandas as pd
+from selenium import webdriver
+from bs4 import BeautifulSoup
+from urllib.parse import quote_plus
+from selenium.common.exceptions import TimeoutException
+
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(levelname)s - %(message)s",
+    datefmt="%d_%m_%y %H:%M:%S",
+)
+# Firefox has worked reliably for this scraper
+driver = webdriver.Firefox()
+
+job_titles = [
+    "python developer",
+    "data analyst",
+    "machine learning engineer",
+    "software engineer",
+    "backend developer",
+    "devops engineer",
+    # "automation engineer",
+    # "network engineer",
+    # "vuejs developer",
+    "react developer",
+    # "nodejs developer",
+    # "frontend developer",
+    "full stack developer",
+    # "ui developer",
+    # "web application developer",
+    # "javascript engineer",
+    "mobile app developer",
+]
+
+# pagination limits
+num_pages = 1
+current_date = datetime.datetime.now().strftime("%Y_%m_%d")
+random.seed(int(datetime.datetime.now().strftime("%d")))
+start_time = time.time()
+jobs_df = pd.DataFrame()
+
+
+def get_job_description(link: str):
+    """
+    Get the job description using the job links scraped."""
+    try:
+        # open a new tab and switch to it (substitute for context management)
+        driver.execute_script('window.open("");')
+        driver.switch_to.window(driver.window_handles[-1])
+        # go to the page and parse it for description
+        driver.get(link)
+        soup = BeautifulSoup(driver.page_source, "html.parser")
+        description = soup.find("div", attrs={"id": "jobDescriptionText"}).text
+        # close the tab and go back to original window (job listings)
+        time.sleep(5 + random.random() * 3)
+        driver.close()
+        driver.switch_to.window(driver.window_handles[0])
+        time.sleep(2)
+        return description
+
+    except Exception as e:
+        logging.exception(f"Exception {e} occured while getting JD")
+        return None
+
+
+def get_jobs(soup):
+    containers = soup.find_all("div", class_="job_seen_beacon")
+
+    jobs = []
+    for container in containers:
+        job_title_element = container.find("h2", class_="jobTitle css-14z7akl eu4oa1w0")
+        company_element = container.find("span", {"data-testid": "company-name"})
+        salary_element = container.find(
+            "div", {"class": "metadata salary-snippet-container css-5zy3wz eu4oa1w0"}
+        )
+        location_element = container.find("div", {"data-testid": "text-location"})
+        date_element = container.find("span", {"class": "css-qvloho eu4oa1w0"})
+
+        if job_title_element is None:
+            # skip containers without a parsable title link
+            continue
+        job_title = job_title_element.text
+        company = company_element.text if company_element else None
+        salary = salary_element.text if salary_element else None
+        location = location_element.text if location_element else None
+        link = "https://in.indeed.com" + str(job_title_element.find("a")["href"])
+        # date = list(date_element.children)[-1] if date_element else None
+        job_description = get_job_description(link)
+
+        jobs.append(
+            {
+                "title": job_title,
+                "company": company,
+                "salary": salary,
+                "location": location,
+                "link": link,
+                "description": job_description,
+            }
+        )
+    return jobs
+
+
+for title in job_titles:
+    all_jobs = []
+    base_url = f"https://in.indeed.com/jobs?q={quote_plus(title)}&from=searchOnHP"
+
+    logging.info(f"Starting process - scrape {title} jobs from indeed")
+    time.sleep(20 + random.random() * 5)
+
+    for i in range(num_pages):
+        try:
+            driver.get(base_url + "&start=" + str(i * 10))
+        except TimeoutException:
+            logging.exception(f"Timeout while loading url")
+        # implicit wait - stops when page loads or time is over
+        driver.implicitly_wait(15)
+        # random delay so requests don't land at a fixed cadence
+        time.sleep(30 * random.random())
+        html = driver.page_source
+
+        soup = BeautifulSoup(html, "html.parser")
+
+        found_jobs = get_jobs(soup)
+        all_jobs.extend(found_jobs)
+
+    # Create the output directory if it doesn't exist; it must match the save path below
+    directory = os.path.join(os.getcwd(), "data/raw/desc")
+    os.makedirs(directory, exist_ok=True)
+    logging.info(f"saving to {directory}")
+
+    df = pd.DataFrame(all_jobs)
+    df["query"] = title
+    df["source"] = "indeed"
+    jobs_df = pd.concat([jobs_df, df], ignore_index=True)
+    # changing definition of date to date the job was scraped
+    jobs_df["date"] = current_date
+    jobs_df.to_csv(f"data/raw/desc/{current_date}desc_in.csv", index=False)
+
+    logging.info(f"Done with {title}, scraped {len(all_jobs)} jobs")
+
+driver.quit()
+end_time = time.time()
+
+logging.info(f"Done in {end_time-start_time} seconds")

+ 58 - 0
Quine Package Quests/Job-finder/src/process_jd.py

@@ -0,0 +1,58 @@
+import pandas as pd
+import nltk
+import string
+import json
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import WordNetLemmatizer
+from sklearn.feature_extraction.text import TfidfVectorizer
+from scipy.sparse import save_npz
+
+# Download the necessary NLTK data
+nltk.download('punkt')
+nltk.download('stopwords')
+nltk.download('wordnet')
+
+# Initialize the lemmatizer and build the stopword set once (set lookups are fast)
+lemmatizer = WordNetLemmatizer()
+stop_words = set(stopwords.words('english'))
+
+# Load the queries (a list of word lists) and flatten them into a set
+with open('data/queries.json', 'r') as f:
+    queries = json.load(f)
+query_words = {word for sublist in queries for word in sublist}
+
+def preprocess_text(text):
+    # Check if text is not NaN
+    if pd.isnull(text):
+        return ''
+    
+    # Convert to lower case
+    text = text.lower()
+    
+    # Remove punctuation
+    text = text.translate(str.maketrans('', '', string.punctuation))
+    
+    # Tokenize
+    words = word_tokenize(text)
+    
+    # Remove stopwords, lemmatize, and keep only words that appear in the queries
+    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words and word in query_words]
+    
+    # Join words back into a string
+    text = ' '.join(words)
+    
+    return text
+
+# Load the data
+df = pd.read_csv('data/desc/2024_03_17desc_in.csv')
+
+# Preprocess the descriptions
+df['description'] = df['description'].apply(preprocess_text)
+
+# Initialize the vectorizer
+vectorizer = TfidfVectorizer()
+
+# Vectorize the descriptions; keep the sparse matrix in its own file, since
+# dense vectors written into a CSV column get stringified and are hard to recover
+tfidf_matrix = vectorizer.fit_transform(df['description'])
+save_npz('description_tfidf.npz', tfidf_matrix)  # assumed output path
+
+# Save the processed data (cleaned text, not vectors)
+df.to_csv('processed_data.csv', index=False)
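A downstream consumer can reload the cleaned text and the sparse matrix saved above (the .npz path is an assumption from this script; rows align with the CSV):

    from scipy.sparse import load_npz
    import pandas as pd

    df = pd.read_csv('processed_data.csv')
    tfidf_matrix = load_npz('description_tfidf.npz')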

+ 14 - 0
Quine Package Quests/Job-finder/src/scoring.py

@@ -0,0 +1,14 @@
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import pandas as pd
+
+df = pd.read_csv('your_file.csv')
+queries_df = pd.read_json('your_queries.json')
+job_descriptions = df['description'].fillna('').tolist()
+queries = queries_df['query'].tolist()
+
+# cosine_similarity needs vectors, not raw strings: fit TF-IDF on the
+# descriptions, then project each query into the same vector space
+vectorizer = TfidfVectorizer()
+description_vectors = vectorizer.fit_transform(job_descriptions)
+
+for query in queries:
+    query_vector = vectorizer.transform([query])
+    df['score_' + query] = cosine_similarity(description_vectors, query_vector).ravel()
+
+df.to_csv('your_scored_file.csv', index=False)
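The per-query score columns can then rank listings, e.g. assuming 'python developer' is one of the queries:

    top = df.sort_values('score_python developer', ascending=False)
    print(top[['title', 'company', 'score_python developer']].head(10))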

+ 47 - 0
Quine Package Quests/Job-finder/src/scraping_template.py

@@ -0,0 +1,47 @@
+import time
+from selenium import webdriver
+from bs4 import BeautifulSoup
+
+# * Template for JS-enabled websites; customize get_data() for each target site
+
+def get_data(soup):
+    """
+    Extract data from a BeautifulSoup object and return it as a list.
+    This function should be customized for each specific scraping task.
+    """
+    # TODO: Implement this function; returning an empty list keeps the template runnable
+    return []
+
+def scrape_pages(base_url, num_pages):
+    """
+    Scrape multiple pages of a website using Selenium and BeautifulSoup.
+    """
+
+    driver = webdriver.Firefox()
+    all_data = []
+
+    for i in range(num_pages):
+        driver.get(base_url + str(i*10))
+        driver.implicitly_wait(10)
+        html = driver.page_source
+        time.sleep(5)
+        soup = BeautifulSoup(html, 'html.parser')
+        page_data = get_data(soup)
+        all_data.extend(page_data)
+
+    driver.quit()
+
+    return all_data
+
+def main():
+    base_url = "https://www.example.com/page?start="
+    num_pages = 5
+    data = scrape_pages(base_url, num_pages)
+    for item in data:
+        print(item)
+
+# TODO: implement some way of storing the data.
+# If storing in CSV, how do we check for and remove duplicates to reduce computation?
+
+if __name__ == "__main__":
+    main()
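As a sketch of the per-site customization, a get_data for a hypothetical listing page whose cards are div.result elements might look like:

    def get_data(soup):
        items = []
        for card in soup.select('div.result'):  # hypothetical selector
            title = card.select_one('h2')
            items.append({'title': title.get_text(strip=True) if title else None})
        return items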

+ 112 - 0
Quine Package Quests/Job-finder/src/yc.py

@@ -0,0 +1,112 @@
+import time
+import logging
+from selenium import webdriver
+from bs4 import BeautifulSoup
+import random
+import pandas as pd
+import datetime
+
+# TODO: yc 16 has some problems with the links (processed twice); add a duplicate check
+
+
+date = datetime.datetime.now().strftime("%Y_%m_%d")
+BASE_URL = "https://www.ycombinator.com/jobs/role"
+driver = webdriver.Firefox()
+
+
+def get_job_description(link):
+    """
+    Get the job description using the job links scraped."""
+    try:
+        # open a new tab and switch to it (substitute for context management)
+        driver.execute_script('window.open("");')
+        driver.switch_to.window(driver.window_handles[-1])
+        # go to the page and parse it for description
+        driver.get(link)
+        soup = BeautifulSoup(driver.page_source, "html.parser")
+        description = soup.find("div", attrs={"class": "prose max-w-full"}).text
+        # close the tab and go back to original window (job listings)
+        time.sleep(10 + random.random() * 5)
+        driver.close()
+        driver.switch_to.window(driver.window_handles[0])
+        return description
+
+    except Exception as e:
+        logging.exception(f"Exception {e} occured while getting JD")
+        return None
+
+
+def get_data(soup):
+    containers = soup.find_all(
+        "div",
+        class_="mb-1 flex flex-col flex-nowrap items-center justify-between gap-y-2 md:flex-row md:gap-y-0",
+    )
+
+    jobs = []
+    for container in containers:
+        job_title_element = container.find("a", class_="font-semibold text-linkColor")
+        if job_title_element is None:
+            # skip cards without a title link; otherwise values from the
+            # previous iteration would leak into this job's record
+            continue
+        job_title = job_title_element.text
+        link = 'https://ycombinator.com' + job_title_element["href"]
+
+        company_element = container.find("span", class_="block font-bold md:inline")
+        company = company_element.text if company_element else None
+
+        location_element = container.find(
+            "div",
+            class_="border-r border-gray-300 px-2 first-of-type:pl-0 last-of-type:border-none last-of-type:pr-0",
+        )
+        location = location_element.text if location_element else None
+
+        date_posted_element = container.find(
+            "span", class_="hidden text-sm text-gray-400 md:inline"
+        )
+        # currently unused downstream; "date" below records the scrape date
+        date_posted = (
+            date_posted_element.text.strip().split("(")[1].split(")")[0]
+            if date_posted_element and "(" in date_posted_element.text
+            else None
+        )
+
+        job_description = get_job_description(link)
+
+        jobs.append(
+            {
+                "title": job_title,
+                "company": company,
+                "location": location,
+                "link": link,
+                "description": job_description,
+                "date": date,
+            }
+        )
+    jobs = pd.DataFrame(jobs)
+    return jobs
+
+
+def scrape_pages(base_url, num_pages):
+    all_data = pd.DataFrame()
+
+    for _ in range(num_pages):
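+        # NOTE: the same URL is fetched on every pass; pagination is not implemented yet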
+        driver.get(base_url)
+        driver.implicitly_wait(15)
+        html = driver.page_source
+        time.sleep(3 + random.random() * 10)
+        soup = BeautifulSoup(html, "html.parser")
+        page_data = get_data(soup)
+        all_data = pd.concat([all_data, page_data])
+
+    driver.quit()
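+    # YC pages aren't searched by query, so reuse the title to fill the shared "query" column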
+    all_data["query"] = all_data["title"]
+    return all_data
+
+
+def main():
+    num_pages = 1
+    data = scrape_pages(BASE_URL, num_pages)
+    data["source"] = "yc"
+    data.to_csv(f"data/raw/desc/{str(date)}desc_yc.csv", index=False)
+
+
+if __name__ == "__main__":
+    main()
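The duplicate-links TODO at the top of yc.py could be handled before saving, assuming link uniquely identifies a posting:

    data = data.drop_duplicates(subset=['link'])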