Vulnerability History
Date | High Risk | Low Risk |
---|---|---|
2025-05-29 | 1 | 1 |
Audit Report Details
Lines of Code: 3527
Open: 2
Resolved: 0
🚨 High Risk Vulnerabilities
⚠️ Low Risk Vulnerabilities
Vulnerable Code:
1# Repo Tree (Python files only, excluding .gitignored files)23├── __init__.py4├── backend5│ ├── __init__.py6│ ├── alembic7│ │ ├── env.py8│ │ └── versions9│ │ ├── 6240656d52f6_add_job_title_column.py10│ │ ├── b240c664ed46_change_user_email_id_to_varchar.py11│ │ └── c256d0279ea6_rename_user_email_table_to_plural.py12│ ├── config.py13│ ├── constants.py14│ ├── database.py15│ ├── db16│ │ ├── companies.py17│ │ ├── company_jobs.py18│ │ ├── job_status.py19│ │ ├── job_titles.py20│ │ ├── processing_tasks.py21│ │ ├── user_emails.py22│ │ ├── user_job_status.py23│ │ ├── user_jobs.py24│ │ ├── user_session.py25│ │ ├── users.py26│ │ └── utils27│ │ ├── user_email_utils.py28│ │ └── user_utils.py29│ ├── email_query_filters30│ ├── main.py31│ ├── routes32│ │ ├── auth_routes.py33│ │ ├── email_routes.py34│ │ ├── file_routes.py35│ │ ├── start_date_routes.py36│ │ └── users_routes.py37│ ├── session38│ │ └── session_layer.py39│ ├── start_date40│ │ └── storage.py41│ ├── static42│ ├── templates43│ ├── tests44│ │ ├── __init__.py45│ │ ├── conftest.py46│ │ ├── routes47│ │ │ ├── __init__.py48│ │ │ ├── conftest.py49│ │ │ └── test_email_routes.py50│ │ ├── test_config_utils.py51│ │ ├── test_constants.py52│ │ ├── test_email_utils.py53│ │ ├── test_filter_schema.py54│ │ └── test_filter_utils.py55│ └── utils56│ ├── auth_utils.py57│ ├── config_utils.py58│ ├── cookie_utils.py59│ ├── email_utils.py60│ ├── file_utils.py61│ ├── filter_utils.py62│ └── llm_utils.py63├── docs64│ └── use_cases65├── frontend66│ ├── app67│ │ ├── api68│ │ │ └── subscribe69│ │ ├── dashboard70│ │ ├── errors71│ │ ├── logout72│ │ ├── preview73│ │ │ ├── dashboard74│ │ │ └── processing75│ │ ├── processing76│ ├── components77│ ├── config78│ ├── public79│ ├── styles80│ ├── tests81│ ├── types82│ └── utils838485# Complete repo contents (files-to-prompt output)8687target_repo/__init__.py88---899091---92target_repo/backend/__init__.py93---949596---97target_repo/backend/config.py98---99import json100101from pydantic import 
field_validator102from pydantic_settings import BaseSettings, SettingsConfigDict, NoDecode103from typing import List104from typing_extensions import Annotated105import logging106107logger = logging.getLogger(__name__)108109110class Settings(BaseSettings):111 GOOGLE_SCOPES: Annotated[List[str], NoDecode]112 REDIRECT_URI: str113 GOOGLE_CLIENT_ID: str114 GOOGLE_API_KEY: str115 COOKIE_SECRET: str116 CLIENT_SECRETS_FILE: str = "credentials.json"117 ENV: str = "dev"118 APP_URL: str119 ORIGIN: str = ".jobba.help"120 DATABASE_URL: str = "default-for-local"121 DATABASE_URL_LOCAL_VIRTUAL_ENV: str = (122 "postgresql://postgres:postgres@localhost:5433/jobseeker_analytics"123 )124 DATABASE_URL_DOCKER: str = (125 "postgresql://postgres:postgres@db:5432/jobseeker_analytics"126 )127128 @field_validator("GOOGLE_SCOPES", mode="before")129 @classmethod130 def decode_scopes(cls, v: str) -> List[str]:131 logger.info("Decoded scopes from string: %s", json.loads(v.strip("'\"")))132 return json.loads(v.strip("'\""))133134 @property135 def is_publicly_deployed(self) -> bool:136 return self.ENV in ["prod", "staging"]137138 model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="allow")139140141settings = Settings(_env_file=".env", _env_file_encoding="utf-8")142143144---145target_repo/backend/constants.py146---147"""148This file contains the main constants used in the application.149"""150151from datetime import datetime, timedelta152from pathlib import Path153from utils.filter_utils import (154 parse_base_filter_config,155) # , parse_override_filter_config156157158GENERIC_ATS_DOMAINS = [159 "us.greenhouse-mail.io",160 "smartrecruiters.com",161 "linkedin.com",162 "ashbyhq.com",163 "hire.lever.co",164 "hi.wellfound.com",165 "talent.icims.com",166 "myworkday.com",167 "otta.com",168]169170DEFAULT_DAYS_AGO = 30171# Get the current date172current_date = datetime.now()173174# Subtract 30 days175date_days_ago = current_date - timedelta(days=DEFAULT_DAYS_AGO)176177# 
Format the date in the required format (YYYY/MM/DD)178formatted_date = date_days_ago.strftime("%Y/%m/%d")179180APPLIED_FILTER_PATH = (181 Path(__file__).parent / "email_query_filters" / "applied_email_filter.yaml"182)183APPLIED_FILTER_OVERRIDES_PATH = (184 Path(__file__).parent185 / "email_query_filters"186 / "applied_email_filter_overrides.yaml"187)188QUERY_APPLIED_EMAIL_FILTER = (189 f"after:{formatted_date} AND ({parse_base_filter_config(APPLIED_FILTER_PATH)})"190)191192# ------ implement override filter later!! #193# OR \n"194# f"{parse_override_filter_config(APPLIED_FILTER_OVERRIDES_PATH)})"195# )196# label:jobs -label:query4197198---199target_repo/backend/database.py200---201import os202from typing import Annotated203from sqlmodel import SQLModel, create_engine, Session204from utils.config_utils import get_settings205from sqlalchemy.ext.declarative import declarative_base206from sqlalchemy.orm import sessionmaker207import fastapi208209210def create_db_and_tables():211 SQLModel.metadata.create_all(engine)212213def get_session():214 return Session(engine)215216217def request_session():218 session = get_session()219220 with session.begin():221 yield session222223224DBSession = Annotated[Session, fastapi.Depends(request_session)]225226settings = get_settings()227IS_DOCKER_CONTAINER = os.environ.get("IS_DOCKER_CONTAINER", 0)228if IS_DOCKER_CONTAINER:229 DATABASE_URL = settings.DATABASE_URL_DOCKER230elif settings.is_publicly_deployed:231 DATABASE_URL = settings.DATABASE_URL232else:233 DATABASE_URL = settings.DATABASE_URL_LOCAL_VIRTUAL_ENV234235engine = create_engine(DATABASE_URL)236SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)237Base = declarative_base()238239---240target_repo/backend/main.py241---242import logging243244from fastapi import FastAPI, HTTPException, Request, Depends245from fastapi.responses import HTMLResponse 246from fastapi.staticfiles import StaticFiles247from fastapi.templating import Jinja2Templates248from 
starlette.middleware.sessions import SessionMiddleware249from fastapi.middleware.cors import CORSMiddleware250from slowapi import Limiter251from slowapi.util import get_remote_address252from slowapi.errors import RateLimitExceeded253from slowapi.middleware import SlowAPIMiddleware254from db.users import UserData255from db.utils.user_utils import add_user256from utils.config_utils import get_settings257from session.session_layer import validate_session258from contextlib import asynccontextmanager259from database import create_db_and_tables260261# Import routes262from routes import email_routes, auth_routes, file_routes, users_routes, start_date_routes263264@asynccontextmanager265async def lifespan(app: FastAPI):266 create_db_and_tables()267 yield268269app = FastAPI(lifespan=lifespan)270settings = get_settings()271APP_URL = settings.APP_URL272app.add_middleware(SessionMiddleware, secret_key=settings.COOKIE_SECRET)273app.mount("/static", StaticFiles(directory="static"), name="static")274275# Register routes276app.include_router(auth_routes.router)277app.include_router(email_routes.router)278app.include_router(file_routes.router)279app.include_router(users_routes.router)280app.include_router(start_date_routes.router)281282limiter = Limiter(key_func=get_remote_address)283app.state.limiter = limiter # Ensure limiter is assigned284285# Configure CORS286if settings.is_publicly_deployed:287 # Production CORS settings288 origins = ["https://www.jobba.help", "https://www.staging.jobba.help", 289 "https://www.app.justajobapp.com", "https://www.api.justajobapp.com"]290else:291 # Development CORS settings292 origins = [293 "http://localhost:3000", # Assuming frontend runs on port 3000294 "http://127.0.0.1:3000",295 ]296297# Add SlowAPI middleware for rate limiting298app.add_middleware(SlowAPIMiddleware)299300# Add CORS middleware301app.add_middleware(302 CORSMiddleware,303 allow_origins=origins, # Allow frontend origins304 allow_credentials=True,305 allow_methods=["*"], # Allow 
all HTTP methods (GET, POST, etc.)306 allow_headers=["*"], # Allow all headers307)308309app.add_middleware(310 CORSMiddleware,311 allow_origins=origins, # Allow frontend origins312 allow_credentials=True,313 allow_methods=["*"], # Allow all HTTP methods (GET, POST, etc.)314 allow_headers=["*"], # Allow all headers315)316317# Set up Jinja2 templates318templates = Jinja2Templates(directory="templates")319320logger = logging.getLogger(__name__)321logging.basicConfig(level=logging.DEBUG, format="%(levelname)s - %(message)s")322323324# Rate limit exception handler325@app.exception_handler(RateLimitExceeded)326async def rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded):327 raise HTTPException(328 status_code=429,329 detail="Too many requests. Please try again later.",330 )331332333@app.post("/api/add-user")334@limiter.limit("3/minute")335async def add_user_endpoint(user_data: UserData, request: Request, user_id: str = Depends(validate_session)):336 """337 This endpoint adds a user to the database and session storage338 """339 try:340 add_user(user_data, request)341 return {"message": "User added successfully"}342 except Exception as e:343 # Log the error for debugging purposes344 logger.error(f"An error occurred while adding user: {e}")345 return {"error": "An error occurred while adding the user."}346347348@app.get("/")349async def root(request: Request, response_class=HTMLResponse):350 return templates.TemplateResponse("homepage.html", {"request": request})351352# Run the app using Uvicorn353if __name__ == "__main__":354 import uvicorn355356 uvicorn.run(app, host="0.0.0.0", port=8000)357358---359target_repo/backend/start_date/storage.py360---361"""362This file contains the main constants used in the application.363"""364from pathlib import Path365from utils.filter_utils import (366 parse_base_filter_config,367)368from constants import QUERY_APPLIED_EMAIL_FILTER369370APPLIED_FILTER_PATH = (371 Path(__file__).parent.parent / "email_query_filters" / 
"applied_email_filter.yaml"372)373374def get_start_date_email_filter(start_date: str) -> str:375 if not start_date:376 return QUERY_APPLIED_EMAIL_FILTER377378 START_DATE_EMAIL_FILTER = (379 f"after:{start_date} AND ({parse_base_filter_config(APPLIED_FILTER_PATH)})"380 )381 return START_DATE_EMAIL_FILTER382383---384target_repo/backend/tests/__init__.py385---386387388---389target_repo/backend/tests/conftest.py390---391import sys392import os393394import pytest395from testcontainers.postgres import PostgresContainer396import sqlalchemy as sa397from sqlalchemy.orm import Session398from sqlmodel import SQLModel399400# Add the parent directory to sys.path401sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))402os.chdir("./backend")403404import database # noqa: E402405406407@pytest.fixture(scope="session")408def postgres_container():409 with PostgresContainer("postgres:13") as postgres:410 yield postgres411412413@pytest.fixture414def engine(postgres_container: PostgresContainer, monkeypatch):415 test_engine = sa.create_engine(416 sa.URL.create(417 "postgresql",418 username=postgres_container.username,419 password=postgres_container.password,420 host=postgres_container.get_container_host_ip(),421 port=postgres_container.get_exposed_port(postgres_container.port),422 database=postgres_container.dbname,423 )424 )425426 monkeypatch.setattr(database, "engine", test_engine)427428 database.create_db_and_tables()429430 yield test_engine431432 with test_engine.begin() as transaction:433 transaction.execute(434 sa.text("SET session_replication_role = :role"), {"role": "replica"}435 )436 for table in SQLModel.metadata.tables.values():437 transaction.execute(table.delete())438439440@pytest.fixture441def db_session(engine, monkeypatch):442 with Session(database.engine) as session:443 yield session444445446---447target_repo/backend/tests/test_config_utils.py448---449from unittest.mock import patch450from utils.config_utils import get_settings451from config 
import Settings452import pytest453import json454import os455456457@pytest.fixture(scope="session", autouse=True)458def setup_static_directory():459 static_dir = os.path.join(os.path.dirname(__file__), "../static")460 if not os.path.exists(static_dir):461 os.makedirs(static_dir)462463464@patch("utils.config_utils.config.Settings")465def test_get_settings_only_called_once_with_lru(mock_settings_call):466 get_settings.cache_clear()467 get_settings()468 get_settings()469 # Ensure the Settings constructor is called only once due to lru_cache470 mock_settings_call.assert_called_once()471 get_settings.cache_clear()472473474def test_import_settings_does_not_raise_error():475 import backend.utils.llm_utils # noqa: F401476 import backend.utils.auth_utils # noqa: F401477478479def test_decode_scopes_valid_json():480 input_str = '["https://www.googleapis.com/auth/gmail.readonly", "openid"]'481 expected_output = ["https://www.googleapis.com/auth/gmail.readonly", "openid"]482 assert Settings.decode_scopes(input_str) == expected_output483484485def test_decode_scopes_with_extra_quotes():486 input_str = '\'["https://www.googleapis.com/auth/gmail.readonly", "openid"]\''487 expected_output = ["https://www.googleapis.com/auth/gmail.readonly", "openid"]488 assert Settings.decode_scopes(input_str) == expected_output489490491def test_decode_scopes_invalid_json():492 input_str = '["https://www.googleapis.com/auth/gmail.readonly", "openid"'493 with pytest.raises(json.JSONDecodeError):494 Settings.decode_scopes(input_str)495496497def test_decode_scopes_empty_string():498 input_str = ""499 with pytest.raises(json.JSONDecodeError):500 Settings.decode_scopes(input_str)501502503def test_prod_is_publicly_deployed_true():504 settings = Settings(ENV="prod")505 assert settings.is_publicly_deployed506507508def test_dev_is_publicly_deployed_false():509 settings = Settings(ENV="dev")510 assert not settings.is_publicly_deployed511512513def test_staging_is_publicly_deployed_true():514 settings = 
Settings(ENV="staging")515 assert settings.is_publicly_deployed516517518---519target_repo/backend/tests/test_constants.py520---521from pathlib import Path522523SUBJECT_LINE = "Invitation from an unknown sender: Interview with TestCompanyName @ Thu May 2, 2024 11:00am - 12pm (PDT) ([email protected])"524SAMPLE_MESSAGE = {525 "id": "abc123",526 "threadId": "abc123",527 "labelIds": ["IMPORTANT", "CATEGORY_PERSONAL", "Label_1"],528 "snippet": "Interview with TestCompanyName Unknown sender This event from [email protected] won't appear in your calendar unless you say you know the sender. Know this sender? When Thursday May 9, 2024 ⋅ 02:40pm –",529 "payload": {530 "partId": "",531 "mimeType": "multipart/mixed",532 "filename": "",533 "headers": [534 {"name": "Delivered-To", "value": "[email protected]"},535 {536 "name": "Received",537 "value": "by 2024:abc:6000:2000:b0:200:1000:5000 with SMTP id cub; Thu, 2 May 2024 16:45:00 -0700 (PDT)",538 },539 {540 "name": "X-Received",541 "value": "by 2024:abc:6000:2000:b0:200:1000:5000 with SMTP id def567-890jkl.9.000000000000; Thu, 2 May 2024 16:45:00 -0700 (PDT)",542 },543 {544 "name": "ARC-Seal",545 "value": "redacted-ARC-value",546 },547 {548 "name": "ARC-Message-Signature",549 "value": "i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=arc-00000000; h=to:from:subject:date:message-id:sender:reply-to:mime-version :dkim-signature:dkim-signature; bh=pqr123; fh=AZ123/PST=; b=GAH",550 },551 {552 "name": "ARC-Authentication-Results",553 "value": "i=1; mx.google.com; dkim=pass [email protected] header.s=10101101 header.b=WOOHOO; dkim=pass [email protected] header.s=google header.b=di8r; spf=pass (google.com: domain of [email protected] designates 000.00.000.00 as permitted sender) [email protected]; dmarc=pass (p=NONE sp=NONE dis=NONE) header.from=testcompanyname.com",554 },555 {"name": "Return-Path", "value": "<[email protected]>"},556 {557 "name": "Received",558 "value": "from mail-fff-a00.google.com (mail-fff-a00.google.com. 
[000.00.000.00]) by mx.google.com with SMTPS id def567-890mno.0.2024.05.02.16.45.00 for <[email protected]> (Google Transport Security); Thu, 2 May 2024 16:45:00 -0700 (PDT)",559 },560 {561 "name": "Received-SPF",562 "value": "pass (google.com: domain of [email protected] designates 000.00.000.00 as permitted sender) client-ip=000.00.000.00;",563 },564 {565 "name": "Authentication-Results",566 "value": "mx.google.com; dkim=pass [email protected] header.s=10101101 header.b=WOOHOO; dkim=pass [email protected] header.s=google header.b=di8r; spf=pass (google.com: domain of [email protected] designates 000.00.000.00 as permitted sender) [email protected]; dmarc=pass (p=NONE sp=NONE dis=NONE) header.from=testcompanyname.com",567 },568 {569 "name": "DKIM-Signature",570 "value": "v=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=10101101; t=1111111111; x=1111111111; dara=google.com; h=to:from:subject:date:message-id:sender:reply-to:mime-version:from :to:cc:subject:date:message-id:reply-to; bh=pqr123; b=GAH",571 },572 {573 "name": "DKIM-Signature",574 "value": "v=1; a=rsa-sha256; c=relaxed/relaxed; d=testcompanyname.com; s=google; t=1111111111; x=1111111111; dara=google.com; h=to:from:subject:date:message-id:sender:reply-to:mime-version:from :to:cc:subject:date:message-id:reply-to; bh=pqr123; b=GAH",575 },576 {577 "name": "X-Google-DKIM-Signature",578 "value": "v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=10101101; t=1111111111; x=1111111111; h=to:from:subject:date:message-id:sender:reply-to:mime-version :x-gm-message-state:from:to:cc:subject:date:message-id:reply-to; bh=pqr123; b=BLAH",579 },580 {581 "name": "X-Gm-Message-State",582 "value": "AGH",583 },584 {585 "name": "X-Google-Smtp-Source",586 "value": "AGH",587 },588 {"name": "MIME-Version", "value": "1.0"},589 {590 "name": "X-Received",591 "value": "by 2222:abc:600:2000:d0:777:9000:4000 with SMTP id def567-890ghi.10.1111111111566; Thu, 2 May 2024 16:45:00 -0700 (PDT)",592 },593 {594 "name": "Reply-To",595 
"value": "Recruiter Name <[email protected]>",596 },597 {598 "name": "Sender",599 "value": "Google Calendar <[email protected]>",600 },601 {602 "name": "Message-ID",603 "value": "<[email protected]>",604 },605 {"name": "Date", "value": "Thu, 2 May 2024 16:45:00 +0000"},606 {607 "name": "Subject",608 "value": "Invitation from an unknown sender: Interview with TestCompanyName @ Thu May 2, 2024 11:00am - 12pm (PDT) ([email protected])",609 },610 {611 "name": "From",612 "value": "Recruiter Name <[email protected]>",613 },614 {"name": "To", "value": "[email protected]"},615 {616 "name": "Content-Type",617 "value": 'multipart/mixed; boundary="000000000000"',618 },619 ],620 "body": {"size": 0},621 "parts": [622 {623 "partId": "0",624 "mimeType": "multipart/alternative",625 "filename": "",626 "headers": [627 {628 "name": "Content-Type",629 "value": 'multipart/alternative; boundary="000000000000"',630 }631 ],632 "body": {"size": 0},633 "parts": [634 {635 "partId": "0.0",636 "mimeType": "text/plain",637 "filename": "",638 "headers": [639 {640 "name": "Content-Type",641 "value": 'text/plain; charset="UTF-8"; format=flowed; delsp=yes',642 },643 {"name": "Content-Transfer-Encoding", "value": "base64"},644 ],645 "body": {646 "size": 2000,647 "data": "abc",648 },649 },650 {651 "partId": "0.1",652 "mimeType": "text/html",653 "filename": "",654 "headers": [655 {656 "name": "Content-Type",657 "value": 'text/html; charset="UTF-8"',658 },659 {660 "name": "Content-Transfer-Encoding",661 "value": "quoted-printable",662 },663 ],664 "body": {665 "size": 30000,666 "data": "abc",667 },668 },669 {670 "partId": "0.2",671 "mimeType": "text/calendar",672 "filename": "invite.ics",673 "headers": [674 {675 "name": "Content-Type",676 "value": 'text/calendar; charset="UTF-8"; method=REQUEST',677 },678 {"name": "Content-Transfer-Encoding", "value": "7bit"},679 ],680 "body": {681 "attachmentId": "",682 "size": 1000,683 },684 },685 ],686 },687 {688 "partId": "1",689 "mimeType": "application/ics",690 
"filename": "invite.ics",691 "headers": [692 {693 "name": "Content-Type",694 "value": 'application/ics; name="invite.ics"',695 },696 {697 "name": "Content-Disposition",698 "value": 'attachment; filename="invite.ics"',699 },700 {"name": "Content-Transfer-Encoding", "value": "base64"},701 ],702 "body": {703 "attachmentId": "",704 "size": 1000,705 },706 },707 ],708 },709 "sizeEstimate": 33333,710 "historyId": "22222222",711 "internalDate": "1111111111000",712}713714DESIRED_PASS_APPLIED_EMAIL_FILTER_SUBJECT = [715 "Thank you for your Application!",716 "Jobba, your application was sent to The Huts",717 "Your Interview with",718 "Thank you for your job application"719]720721DESIRED_FAIL_APPLIED_EMAIL_FILTER_FROM = [722 "[email protected]", # made up, would be better to capture the real example723 "[email protected]",724 "[email protected]",725 "[email protected]",726 "[email protected]",727 "[email protected]",728 "[email protected]",729 "[email protected]"730]731732DESIRED_FAIL_APPLIED_EMAIL_FILTER_SUBJECT = [733 "Apply to",734 "Apply now",735 "New job",736 "Job Search Council Matching - Next Steps"737]738739DESIRED_PASS_APPLIED_EMAIL_FILTER_FROM = ["[email protected]", "myworkday.com"]740741SAMPLE_FILTER_PATH = Path(__file__).parent / "sample_base_filter.yaml"742EXPECTED_SAMPLE_QUERY_STRING = """(subject:"application has been submitted" 743 OR (subject:"application to" AND subject:"successfully submitted") 744 OR from:"[email protected]" 745 AND -from:"[email protected]" 746 AND -subject:"watering")"""747748749---750target_repo/backend/tests/test_email_utils.py751---752from unittest import mock753import pytest754755from tests.test_constants import SAMPLE_MESSAGE, SUBJECT_LINE756import utils.email_utils as email_utils757import db.utils.user_email_utils as user_email_utils758759def test_get_top_consecutive_capitalized_words():760 test_cases = {761 (762 ("Hello", 10), # capitalized, highest frequency, prioritize763 ("World", 8), # capitalized, lower frequency, ignore764 
): "Hello",765 (766 ("Hello", 10), # capitalized, highest frequency, prioritize767 ("World", 10), # capitalized, highest frequency, add to result768 ("How", 5), # capitalized, lower frequency, ignore769 ): "Hello World",770 (771 ("hello", 5), # not capitalized, highest frequency, ignore772 ("World", 5), # capitalized, highest frequency, prioritize773 ("How", 5), # capitalized, highest frequency, add to result774 ("are", 5), # not capitalized, highest frequency, ignore775 ): "World How",776 (777 ("hello", 5), # not capitalized, highest frequency, ignore778 ("world", 5), # capitalized, highest frequency, prioritize779 ("how", 5), # capitalized, highest frequency, add to result780 ("are", 5), # not capitalized, highest frequency, ignore781 ): "", # no consecutive capitalized words782 }783 for word_list, expected_value in test_cases.items():784 result = email_utils.get_top_consecutive_capitalized_words(word_list)785 assert result == expected_value786787788def test_is_valid_email():789 email_test_cases = {790 "[email protected]": True,791 "[email protected]": False, # Invalid domain792 "no-reply.com": False, # Missing @793 }794 for email, expected_value in email_test_cases.items():795 is_valid = email_utils.is_valid_email(email)796 assert is_valid == expected_value, "email: %s" % email797798799def test_is_email_automated():800 email_test_cases = {801 "[email protected]": True,802 "[email protected]": True,803 "[email protected]": True,804 "[email protected]": True,805 "[email protected]": True,806 "[email protected]": False,807 }808 for email, expected_value in email_test_cases.items():809 is_automated = email_utils.is_automated_email(email)810 assert is_automated == expected_value, "email: %s" % email811812813def test_get_email_subject_line():814 subject_line = email_utils.get_email_subject_line(SAMPLE_MESSAGE)815 assert (816 subject_line817 == "Invitation from an unknown sender: Interview with TestCompanyName @ Thu May 2, 2024 11:00am - 12pm (PDT) ([email 
protected])"818 )819820821def test_get_email_from_address():822 from_address = email_utils.get_email_from_address(SAMPLE_MESSAGE)823 assert from_address == "[email protected]"824825826def test_get_email_domain():827 from_email_domain = email_utils.get_email_domain_from_address(828 "[email protected]"829 )830 assert from_email_domain == "testcompanyname.com"831832833def test_is_generic_email_domain():834 assert email_utils.is_generic_email_domain("hire.lever.co")835 assert email_utils.is_generic_email_domain("us.greenhouse-mail.io")836837838def test_get_last_capitalized_words_in_line():839 last_capitalized_words = email_utils.get_last_capitalized_words_in_line(840 "Thank you for your application to CompanyName"841 )842 assert last_capitalized_words == "CompanyName"843844845def test_get_company_name_returns_email_domain():846 company_name = email_utils.get_company_name(847 id="abc123", msg=SAMPLE_MESSAGE, subject_line=SUBJECT_LINE848 )849 assert company_name == "testcompanyname"850851852def test_get_company_name_returns_top_word():853 """Default behavior for company name is to return the854 highest frequency word that appears in the email body."""855 with mock.patch(856 "utils.email_utils.get_top_word_in_email_body", return_value="FakeCompany"857 ):858 company_name = email_utils.get_company_name(859 id="abc123", msg=SAMPLE_MESSAGE, subject_line=SUBJECT_LINE860 )861 assert company_name == "FakeCompany"862863864def test_get_company_name_returns_last_capital_word_in_subject_line():865 """Default behavior for company name is to return the866 highest frequency word that appears in the email body."""867 with (868 mock.patch(869 "utils.email_utils.get_top_word_in_email_body", return_value="interview"870 ),871 mock.patch(872 "utils.email_utils.get_email_from_address",873 return_value="[email protected]",874 ),875 ):876 company_name = email_utils.get_company_name(877 id="abc123",878 msg=SAMPLE_MESSAGE,879 subject_line="Thanks for interviewing with CoolCompany",880 )881 assert 
company_name == "CoolCompany"882883884def test_get_email_received_at_timestamp():885 received_at = email_utils.get_received_at_timestamp(1, SAMPLE_MESSAGE)886 assert received_at == "Thu, 2 May 2024 16:45:00 +0000"887888889@pytest.fixture890def mock_user():891 user = mock.MagicMock()892 user.user_id = "test_user_123"893 return user894895896@pytest.fixture897def message_data_with_list_values():898 """Message data where received_at is a list instead of a string"""899 return {900 "id": "19501385930c533f",901 "company_name": "",902 "application_status": "",903 "received_at": "Thu, 13 Feb 2025 21:30:24 +0000 (UTC)",904 "subject": "Message replied: Are you looking for Remote opportunities?",905 "job_title": "",906 "from": "Tester Recruiter <[email protected]>"907 }908909910@mock.patch('db.utils.user_email_utils.check_email_exists')911def test_create_user_email_with_list_values(mock_check_email, mock_user, message_data_with_list_values, caplog):912 """Test that create_user_email handles message_data_with_list_values correctly"""913 mock_check_email.return_value = False914 result = user_email_utils.create_user_email(mock_user, message_data_with_list_values)915 assert result is not None # user email created successfully916917918---919target_repo/backend/tests/test_filter_schema.py920---921"""922these tests are intended to verify that the changes made to filter yamls will yield the923desired results. Note that these tests DO NOT make any checks against functions in924filter_utils. 
If you make changes there, the correct tests are found in test_filter_utils.925926tests for override filters have not yet been implemented927"""928929import pytest930from pathlib import Path931import yaml932from typing import List, Dict, Union933import re934from constants import APPLIED_FILTER_PATH # , APPLIED_FILTER_OVERRIDES_PATH935from tests.test_constants import (936 DESIRED_FAIL_APPLIED_EMAIL_FILTER_SUBJECT,937 DESIRED_PASS_APPLIED_EMAIL_FILTER_FROM,938 SAMPLE_FILTER_PATH,939)940941FilterConfigType = List[Dict[str, Union[str, int, bool, list, dict]]]942943FILTER_CONFIG_DIR = Path(__file__).parent.parent / "email_query_filters"944945946def get_base_filter_config_paths() -> List[Path]:947 return [SAMPLE_FILTER_PATH] + [948 x for x in FILTER_CONFIG_DIR.iterdir() if "override" not in str(x)949 ]950951952def get_override_filter_config_paths() -> List[Path]:953 return [x for x in FILTER_CONFIG_DIR.iterdir() if "override" in str(x)]954955956def load_filter_config(filter_path: str) -> FilterConfigType:957 with open(filter_path, "r") as fid:958 filter_config = yaml.safe_load(fid)959 return filter_config960961962def validate_schema_block_order(filter_config: FilterConfigType) -> bool:963 """964 Validates that 'exclude' blocks appear after 'include' blocks in the schema.965 """966967 include_seen = False968 for block in filter_config:969 how = block.get("how")970 if how == "include":971 include_seen = True972 elif how == "exclude" and not include_seen:973 return False # Exclude block before any include block974975 return True976977978@pytest.mark.parametrize(979 "filter_config", [load_filter_config(x) for x in get_base_filter_config_paths()]980)981def test_base_filter_yaml_schema(filter_config):982 logic_list = [block["logic"] for block in filter_config if block["logic"]]983 how_list = [block["how"] for block in filter_config]984 exclude_terms = sum(985 [block["terms"] for block in filter_config if block["how"] == "exclude"], []986 )987988 assert all(989 [990 (x == "any" 
and y == "include") or (x == "all" and y == "exclude")991 for x, y in zip(logic_list, how_list)992 ]993 ), "logic=any is not allowed for how=exclude"994 assert all(["*" not in x for x in exclude_terms]), (995 "wildcard is not allowed in exclude blocks"996 )997 assert validate_schema_block_order(filter_config), (998 "Exclude block found before an include block"999 )100010011002def apply_base_filter(field_text, field_name, filter_config) -> bool:1003 """Applies the YAML filter to the given text."""10041005 ret_val = False # Default to failing if no filter logic is defined.10061007 for block in filter_config:1008 if block["field"] == field_name:1009 # check if the text is in the any, include block for that field1010 if block["logic"] == "any" and block["how"] == "include":1011 # simple compare1012 if not ret_val:1013 ret_val = any(1014 [1015 x.lower() in field_text.lower()1016 for x in block["terms"]1017 if "*" not in x1018 ]1019 )10201021 # use regex for wildcard compare1022 if not ret_val:1023 ret_val = any(1024 [1025 re.findall(1026 x.replace(" * ", ".*").lower(), field_text.lower()1027 )1028 for x in block["terms"]1029 if "*" in x1030 ]1031 )10321033 # check if the text is in the all, exclude block for that field.1034 # all, exclude logic will override any matching includes1035 if ret_val:1036 if block["logic"] == "all" and block["how"] == "exclude":1037 ret_val = all(1038 [x.lower() not in field_text.lower() for x in block["terms"]]1039 )10401041 return ret_val104210431044@pytest.mark.parametrize(1045 "test_constant,filter_config",1046 [(DESIRED_FAIL_APPLIED_EMAIL_FILTER_SUBJECT, APPLIED_FILTER_PATH)],1047)1048def test_apply_email_filter_subject_fail(test_constant, filter_config):1049 """1050 Tests if the desired subject pairs in test_constants will fail the filter1051 """1052 filter_config = load_filter_config(APPLIED_FILTER_PATH)10531054 result_list = []1055 for subject_text in test_constant:1056 result = apply_base_filter(subject_text, "subject", 
filter_config)1057 result_list.append(result)10581059 assert not any(result_list), (1060 f"These subject pairs failed to fail: {[x for x, y in list(zip(test_constant, result_list)) if y]}"1061 )106210631064@pytest.mark.parametrize(1065 "test_constant,filter_config",1066 [(DESIRED_PASS_APPLIED_EMAIL_FILTER_FROM, APPLIED_FILTER_PATH)],1067)1068def test_apply_email_filter_from_pass(test_constant, filter_config):1069 """1070 Tests if the desired from pairs in test_constants will pass the filter1071 """1072 filter_config = load_filter_config(APPLIED_FILTER_PATH)10731074 result_list = []1075 for from_text in test_constant:1076 result = apply_base_filter(from_text, "from", filter_config)1077 result_list.append(result)10781079 assert all(result_list), (1080 f"These from pairs failed to pass: {[x for x, y in list(zip(test_constant, result_list)) if not y]}"1081 )108210831084@pytest.mark.parametrize(1085 "test_constant,filter_config",1086 [(DESIRED_FAIL_APPLIED_EMAIL_FILTER_SUBJECT, APPLIED_FILTER_PATH)],1087)1088def test_apply_email_filter_from_fail(test_constant, filter_config):1089 """1090 Tests if the desired from pairs in test_constants will fail the filter1091 """1092 filter_config = load_filter_config(APPLIED_FILTER_PATH)10931094 result_list = []1095 for from_text in test_constant:1096 result = apply_base_filter(from_text, "from", filter_config)1097 result_list.append(result)10981099 assert not any(result_list), (1100 f"These from pairs failed to fail: {[x for x, y in list(zip(test_constant, result_list)) if y]}"1101 )110211031104---1105target_repo/backend/tests/test_filter_utils.py1106---1107"""1108test that the strings produced by filter utils match expectations11091110tests for override filters have not yet been implemented.1111"""11121113from typing import List, Dict, Union11141115from utils.filter_utils import (1116 parse_base_filter_config,1117) # , parse_override_filter_config1118from tests.test_constants import SAMPLE_FILTER_PATH, 
EXPECTED_SAMPLE_QUERY_STRING11191120FilterConfigType = List[Dict[str, Union[str, int, bool, list, dict]]]112111221123def test_parse_filter_config_against_sample_filter(1124 filter_path=SAMPLE_FILTER_PATH, expected_query_string=EXPECTED_SAMPLE_QUERY_STRING1125):1126 result_str = parse_base_filter_config(filter_path)11271128 # remove white space from expected string for the purpose of comparing1129 expected_query_string = (1130 expected_query_string.replace("\n", "").replace("\t", "").replace(" ", "")1131 )11321133 assert result_str == expected_query_string, (1134 "result query string doesn't match expected query string"1135 )113611371138---1139target_repo/backend/tests/routes/__init__.py1140---114111421143---1144target_repo/backend/tests/routes/conftest.py1145---1146from datetime import datetime, timedelta1147from unittest import mock11481149import pytest1150from fastapi.testclient import TestClient11511152from db.users import Users1153import database1154import main115511561157@pytest.fixture1158def client(db_session):1159 main.app.dependency_overrides[database.request_session] = lambda: db_session1160 test_client = TestClient(main.app)11611162 return test_client116311641165@pytest.fixture1166def logged_in_user(db_session, client):1167 # create user1168 user = Users(1169 user_id="123",1170 user_email="[email protected]",1171 start_date=datetime(2000, 1, 1),1172 )1173 db_session.add(user)1174 db_session.flush()11751176 # log in1177 mock_credentials = mock.Mock(1178 **{1179 "expiry": datetime.utcnow() + timedelta(seconds=10),1180 "token": "fake access token",1181 "to_json.return_value": {"foo": "bar"},1182 }1183 )1184 mock_decoded_token = {"sub": user.user_id, "email": user.user_email}1185 with (1186 mock.patch(1187 "routes.auth_routes.Flow",1188 **{"from_client_secrets_file.return_value.credentials": mock_credentials},1189 ),1190 mock.patch(1191 "utils.auth_utils.id_token",1192 **{"verify_oauth2_token.return_value": mock_decoded_token},1193 ),1194 ):1195 auth_resp = 
client.get("/login", params={"code": "abc"}, follow_redirects=False)1196 assert auth_resp.status_code == 3031197 assert auth_resp.headers["Location"] == "http://localhost:3000/dashboard"11981199 return user120012011202---1203target_repo/backend/tests/routes/test_email_routes.py1204---1205from utils import auth_utils1206from unittest import mock1207from datetime import datetime12081209from fastapi import Request1210from sqlalchemy.orm import Session1211from google.oauth2.credentials import Credentials12121213from db.users import Users1214from db.processing_tasks import TaskRuns, FINISHED, STARTED1215from routes.email_routes import fetch_emails_to_db121612171218def test_processing(db_session, client, logged_in_user):1219 db_session.add(TaskRuns(user=logged_in_user, status=STARTED))1220 db_session.flush()12211222 # make request to check on processing status1223 resp = client.get("/processing", follow_redirects=False)12241225 # assert response1226 assert resp.status_code == 200, resp.headers1227 assert resp.json()["processed_emails"] == 0122812291230def test_processing_404(db_session, client, logged_in_user):1231 resp = client.get("/processing", follow_redirects=False)1232 assert resp.status_code == 404123312341235def test_fetch_emails_to_db(db_session: Session):1236 test_user_id = "123"12371238 db_session.add(1239 Users(1240 user_id=test_user_id,1241 user_email="[email protected]",1242 start_date=datetime(2000, 1, 1),1243 )1244 )1245 db_session.commit()12461247 with mock.patch("routes.email_routes.get_email_ids"):1248 fetch_emails_to_db(1249 auth_utils.AuthenticatedUser(Credentials("abc")),1250 Request({"type": "http", "session": {}}),1251 user_id=test_user_id,1252 )12531254 task_run = db_session.get(TaskRuns, test_user_id)1255 assert task_run.status == FINISHED125612571258def test_fetch_emails_to_db_in_progress_rate_limited_no_processing(db_session: Session):1259 test_user_id = "123"12601261 user = Users(1262 user_id=test_user_id,1263 user_email="[email 
class AuthenticatedUser:
    """
    The AuthenticatedUser class is used to
    store information about the user. This
    class is instantiated after the user has
    successfully authenticated with Google.

    Attributes set by ``__init__``:
        creds: The Google OAuth2 credentials passed in.
        user_id / user_email: Result of ``get_user_id_and_email()``.
        filepath: ``get_user_filepath(user_id)``.
        start_date: The ``start_date`` argument, stored as given.
    """

    def __init__(self, creds: Credentials, start_date=None):
        self.creds = creds
        self.user_id, self.user_email = self.get_user_id_and_email()
        self.filepath = get_user_filepath(self.user_id)
        self.start_date = start_date

    def get_user_id_and_email(self) -> tuple:
        """
        Retrieves the user ID and email from Google OAuth2 credentials.

        The ID token on ``self.creds`` is verified against
        ``settings.GOOGLE_CLIENT_ID``. If the token is missing, the
        credentials are refreshed once before verifying.

        Returns:
        - user_id: The token's ``sub`` claim, or a random UUID string
          ("proxy ID") when the token cannot be obtained or verified.
        - email: The token's ``email`` claim, or None when a proxy ID is used.
        """
        # NOTE(review): on any failure this returns a random proxy ID rather
        # than raising — callers presumably treat a None email as "not
        # verified"; confirm before relying on user_id for authorization.
        try:
            logger.info("Verifying ID token...")

            # Ensure we have an ID token
            if not self.creds.id_token:
                logger.warning("ID token is missing, trying to refresh credentials...")
                self.creds.refresh(Request())  # Refresh credentials

            # If still missing, raise an error
            if not self.creds.id_token:
                raise ValueError("No ID token available after refresh.")

            decoded_token = id_token.verify_oauth2_token(
                self.creds.id_token, Request(), audience=settings.GOOGLE_CLIENT_ID
            )
            user_id = decoded_token["sub"]  # 'sub' is the unique user ID
            user_email = decoded_token.get("email")  # 'email' is the user's email address
            return user_id, user_email

        except (KeyError, TypeError):
            return self._retry_after_refresh()
        except Exception as e:
            logger.error("Error verifying ID token: %s", e)
            return self._proxy_identity("Could not verify ID token. Using proxy ID: %s")

    def _retry_after_refresh(self) -> tuple:
        """Refresh the credentials and retry verification at most once."""
        try:
            # refresh() updates the credentials object in place and returns
            # None, so its result must not be assigned back to self.creds.
            self.creds.refresh(Request())
        except Exception as e:
            logger.error("Error refreshing credentials: %s", e)
            return self._proxy_identity(
                "Could not retrieve user ID. Using proxy ID: %s"
            )

        if not self.creds.id_token:
            return self._proxy_identity(
                "Could not retrieve user ID. Using proxy ID: %s"
            )

        # The _retry marker limits the recursion to a single extra attempt.
        if not hasattr(self, "_retry"):
            self._retry = True
            return self.get_user_id_and_email()

        return self._proxy_identity(
            "Could not retrieve user ID after retry. Using proxy ID: %s"
        )

    def _proxy_identity(self, log_message: str) -> tuple:
        """Log ``log_message`` with a freshly generated ID and return ``(id, None)``."""
        proxy_user_id = str(uuid.uuid4())
        logger.error(log_message, proxy_user_id)
        return proxy_user_id, None  # Generate a random ID and return None for email
def clean_whitespace(text: str) -> str:
    r"""
    Remove every newline (\n), carriage return (\r) and tab (\t) from a string.

    Spaces are left untouched. A missing value (``None``, e.g. a header that
    is absent from a message) or an empty string yields ``""`` instead of
    raising.
    """
    if not text:
        return ""
    # Single pass over the string instead of three chained replace() calls.
    return text.translate(str.maketrans("", "", "\n\r\t"))
def _decode_text_part(part) -> str:
    """Decode a text MIME part using its declared charset (UTF-8 as fallback)."""
    payload = part.get_payload(decode=True)
    charset = part.get_content_charset() or "utf-8"
    try:
        return payload.decode(charset, errors="ignore")
    except LookupError:
        # Unknown or misspelled charset label in the message.
        return payload.decode("utf-8", errors="ignore")


def get_email(message_id: str, gmail_instance=None):
    """
    Fetch one Gmail message in raw format and flatten it into a dict.

    Args:
        message_id: Gmail id of the message to fetch.
        gmail_instance: Gmail API service object; when falsy nothing is
            fetched.

    Returns:
        A dict with the keys ``id``, ``threadId``, ``from``, ``to``,
        ``subject``, ``date``, ``text_content``, ``html_content`` and
        ``raw_text_content``. ``text_content`` is the combined text produced
        by ``get_email_content``; ``raw_text_content`` is the plain-text body
        as found in the message (None if there is none). An empty dict is
        returned when ``gmail_instance`` is falsy or when any error occurs
        (the error is logged).
    """
    if not gmail_instance:
        return {}

    try:
        message = (
            gmail_instance.users()
            .messages()
            .get(userId="me", id=message_id, format="raw")
            .execute()
        )
        # Parse the raw bytes directly. Decoding the whole message as UTF-8
        # first fails on mail in other encodings and makes
        # get_payload(decode=True) mangle non-ASCII bodies.
        raw_bytes = base64.urlsafe_b64decode(message["raw"].encode("ASCII"))
        mime_msg = email.message_from_bytes(raw_bytes)

        def header(name):
            # A missing header becomes "" so the message is still processed.
            value = mime_msg.get(name)
            return "" if value is None else clean_whitespace(str(value))

        email_data = {
            "id": message_id,
            "threadId": message.get("threadId", None),
            "from": header("From"),
            "to": header("To"),
            "subject": header("Subject"),
            "date": mime_msg.get("Date"),
            "text_content": None,
            "html_content": None,
        }

        # Extract body of the email
        if mime_msg.is_multipart():
            for part in mime_msg.walk():
                # Skip attachments; only inline text parts form the body.
                if "attachment" in str(part.get("Content-Disposition")):
                    continue
                content_type = part.get_content_type()
                if content_type == "text/plain":
                    email_data["text_content"] = _decode_text_part(part)
                elif content_type == "text/html":
                    email_data["html_content"] = _decode_text_part(part)
        else:
            content_type = mime_msg.get_content_type()
            if content_type == "text/plain":
                email_data["text_content"] = _decode_text_part(mime_msg)
            elif content_type == "text/html":
                email_data["html_content"] = _decode_text_part(mime_msg)

        # Keep the untouched plain-text body before it is merged with the
        # subject and the HTML text.
        email_data["raw_text_content"] = email_data["text_content"]
        email_data["text_content"] = get_email_content(email_data)

        return email_data

    except Exception as e:
        logger.exception("Error retrieving email with id %s: %s", message_id, e)
        return {}
def get_word_frequency(cleaned_email):
    """
    Count how often each word occurs in the first cleaned email text.

    Args:
        cleaned_email: Sequence whose first element is the cleaned email
            text; words are obtained by splitting that text on single spaces.

    Returns:
        A list of ``(word, count)`` tuples sorted by descending count; words
        with equal counts keep the order in which they first appear. An empty
        list is returned (and the error logged) if the input cannot be
        processed.
    """
    from collections import Counter

    try:
        # most_common() sorts by count, descending, and is stable for ties.
        return Counter(cleaned_email[0].split(" ")).most_common()
    except Exception as e:
        logger.error("Error getting word frequency: %s", e)
        return []
def get_top_consecutive_capitalized_words(tuples_list):
    """
    Helper function to parse company name from an email.

    Walks ``(word, count)`` pairs in the given order (expected to be sorted
    by descending count) and collects the capitalized words. The count of the
    first capitalized word is the reference; collection stops at the first
    capitalized word whose count is lower. Words that do not start with an
    uppercase letter are skipped.

    Args:
        tuples_list: Iterable of ``(word, count)`` pairs.

    Returns:
        The collected words joined by single spaces, or "" when there are
        none or when the input cannot be processed (the error is logged).
    """
    try:
        result = []
        top_count = None  # count of the first capitalized word seen
        for word, count in tuples_list:
            if not (word and word[0].isupper()):
                continue
            if top_count is None:
                top_count = count
            if count < top_count:
                break
            result.append(word)
        return " ".join(result)
    except Exception as e:
        logger.error("Error getting top consecutive capitalized words: %s", e)
        return ""
def parse_base_filter_config(filter_path: str) -> str:
    """
    Build a Gmail search string from a base-filter YAML file.

    Each block in the file contributes its terms, rendered with
    ``parse_simple`` (and ``parse_wildcard`` for include terms containing
    ``*``) and joined with `` OR `` for ``logic: any`` or `` AND `` for
    ``logic: all``. A block is attached to the text built so far using that
    block's own operator. The result is wrapped in one pair of parentheses.

    Args:
        filter_path (str): path of the YAML filter file

    Returns:
        The search string; ``"()"`` when the file has no usable blocks.

    Raises:
        ValueError: if a block's ``logic`` is neither ``"any"`` nor ``"all"``.
    """
    operators = {"any": " OR ", "all": " AND "}

    with open(filter_path, "r", encoding="utf-8") as fid:
        data = yaml.safe_load(fid)

    filter_str = ""
    # An empty YAML document loads as None; treat it as "no blocks".
    for block in data or []:
        logic = block["logic"]
        if logic not in operators:
            # Fail loudly rather than reusing the previous block's operator
            # (or hitting an unbound local on the first block).
            raise ValueError(f"Unsupported filter logic: {logic!r}")
        operator = operators[logic]

        # parse each item based on schema logic
        how = block["how"]
        if how == "include":
            terms = block["terms"]
            filters = [
                parse_simple(x, block["field"], exclude=False)
                for x in terms
                if "*" not in x
            ] + [
                parse_wildcard(x, block["field"], exclude=False)
                for x in terms
                if "*" in x
            ]
        elif how == "exclude":
            filters = [
                parse_simple(x, block["field"], exclude=True)
                for x in block["terms"]
            ]
        else:
            filters = []

        if filters:
            # Every block after the first needs a leading operator.
            if filter_str:
                filter_str += operator
            filter_str += operator.join(filters)

    return "(" + filter_str + ")"
1997 1998 Given the content of an email related to job applications or recruitment, assign one of the following labels to job application status based on the main purpose or outcome of the message:1999 2000 Application confirmation2001 Rejection2002 Availability request2003 Information request2004 Assessment sent2005 Interview invitation2006 Did not apply - inbound request2007 Action required from company2008 Hiring freeze notification2009 Withdrew application2010 Offer made2011 False positive, not related to job search2012 Informational outreach20132014 Labeling Rules and Explanations for Job Application Status:20152016 Application confirmation2017 Assign this label if the email confirms receipt of a job application.2018 Examples: "We have received your application", "Thank you for applying", "Your application has been submitted".20192020 Rejection2021 Use this label for emails explicitly stating that the candidate is not moving forward in the process.2022 Examples: "We regret to inform you...", "We will not be proceeding with your application", "You have not been selected".20232024 Availability request2025 Assign this label if the company asks for your availability for a call, interview, or meeting.2026 Examples: "Please let us know your availability", "When are you free for a call?", "Can you share your available times?"20272028 Information request2029 Use this label if the company requests additional information, documents, or clarification.2030 Examples: "Please send your portfolio", "Can you provide references?", "We need more information about..."20312032 Assessment sent2033 Assign this label if the company sends a test, assignment, or assessment for you to complete as part of the hiring process.2034 Examples: "Please complete the attached assessment", "Here is your coding challenge", "Take-home assignment enclosed".20352036 Interview invitation2037 Use this label if the company invites you to an interview (phone, video, or onsite).2038 Examples: "We would 
like to invite you to interview", "Interview scheduled", "Please join us for an interview".20392040 Did not apply - inbound request2041 Assign this label if the company or recruiter reaches out to you first, and you did not apply for the position.2042 Examples: "We found your profile and would like to connect", "Are you interested in this opportunity?", "We came across your resume".20432044 Action required from company2045 Use this label if the next step is pending from the company, and you are waiting for their response or action.2046 Examples: "We will get back to you", "Awaiting feedback from the team", "We will contact you with next steps".20472048 Hiring freeze notification2049 Assign this label if the company notifies you that the position is on hold or canceled due to a hiring freeze.2050 Examples: "Position is on hold", "Hiring freeze in effect", "We are pausing recruitment".20512052 Withdrew application2053 Use this label if you (the candidate) have withdrawn your application, or the email confirms your withdrawal.2054 Examples: "You have withdrawn your application", "Thank you for letting us know you are no longer interested".20552056 Offer made2057 Assign this label if the company extends a job offer to you.2058 Examples: "We are pleased to offer you the position", "Offer letter attached", "Congratulations, you have been selected".20592060 False positive, not related to job search2061 Use this label if the email is not related to job applications, recruitment, or hiring.2062 Examples: Newsletters, spam, unrelated notifications, or personal emails.20632064 Informational outreach2065 Assign this label if the company or recruiter is reaching out to share information, updates, or opportunities, but not in direct response to an application or as an explicit invitation to apply.2066 Examples: "We wanted to let you know about upcoming roles", "Here’s information about our company", "General outreach about our hiring process".20672068 Provide the output in JSON 
format, for example: "company_name": "company_name", "job_application_status": "status", "job_title": "job_title"2069 Remove backticks. Only use double quotes. Enclose key and value pairs in a single pair of curly braces.2070 Email: {email_text}2071 """20722073 retries = 3 # Max retries2074 delay = 60 # Initial delay2075 for attempt in range(retries):2076 try:2077 logger.info("Calling generate_content")2078 response: GenerateTextResponse = model.generate_content(prompt)2079 response.resolve()2080 response_json: str = response.text2081 logger.info("Received response from model: %s", response_json)2082 if response_json:2083 cleaned_response_json = (2084 response_json.replace("json", "")2085 .replace("`", "")2086 .replace("'", '"')2087 .strip()2088 )2089 cleaned_response_json = (2090 response_json.replace("json", "")2091 .replace("`", "")2092 .replace("'", '"')2093 .strip()2094 )2095 logger.info("Cleaned response: %s", cleaned_response_json)2096 return json.loads(cleaned_response_json)2097 else:2098 logger.error("Empty response received from the model.")2099 return None2100 except Exception as e:2101 if "429" in str(e):2102 logger.warning(2103 f"Rate limit hit. 
class CompanyJobs(SQLModel, table=True):
    """A job posting belonging to a company (table ``company_jobs``)."""

    __tablename__ = "company_jobs"
    company_job_id: int = Field(default=None, primary_key=True)
    # References companies.company_id.
    company_id: int = Field(foreign_key="companies.company_id", nullable=False)
    # Optional reference to job_titles.job_title_id.
    company_job_title_id: int | None = Field(
        default=None, foreign_key="job_titles.job_title_id", nullable=True
    )
    company_job_description: str | None = Field(default=None, nullable=True)
    # Defaults to the time the row object is created.
    company_job_posted_at: datetime = Field(
        default_factory=datetime.utcnow, nullable=False
    )
    company_job_location: str | None = Field(default=None, nullable=True)

    __table_args__ = (
        # Ensure that company, title, location and posting time together are
        # unique. The names must match the column names declared above.
        UniqueConstraint(
            "company_id",
            "company_job_title_id",
            "company_job_location",
            "company_job_posted_at",
            name="unique_job",
        ),
    )
# --- target_repo/backend/db/job_titles.py ---
from sqlmodel import SQLModel, Field, UniqueConstraint


class JobTitles(SQLModel, table=True):
    """Row of the ``job_titles`` table; each title string is unique."""

    __tablename__ = "job_titles"
    job_title_id: int = Field(default=None, primary_key=True)
    job_title: str

    __table_args__ = (UniqueConstraint("job_title", name="unique_job_title"),)


# --- target_repo/backend/db/processing_tasks.py ---
from sqlmodel import Field, SQLModel, Relationship
from datetime import datetime, timezone
import sqlalchemy as sa
from db.users import Users

FINISHED = "finished"
STARTED = "started"


class TaskRuns(SQLModel, table=True):
    """Progress record of the email-processing task, one row per user."""

    __tablename__ = "processing_task_runs"
    user_id: str = Field(foreign_key="users.user_id", primary_key=True)
    # NOTE(review): `created` defaults to a naive local timestamp while
    # `updated` defaults to an aware UTC one - confirm which convention the
    # database column is meant to hold before comparing the two.
    created: datetime = Field(default_factory=datetime.now, nullable=False)
    updated: datetime = Field(
        sa_column_kwargs={"onupdate": sa.func.now()},
        default_factory=lambda: datetime.now(timezone.utc),
        nullable=False,
    )
    status: str = Field(nullable=False)  # STARTED or FINISHED
    total_emails: int = 0
    processed_emails: int = 0

    user: Users = Relationship()


# --- target_repo/backend/db/user_emails.py ---
from sqlmodel import SQLModel, Field
from datetime import datetime


class UserEmails(SQLModel, table=True):
    """Row of the ``user_emails`` table, keyed by (Gmail id, user id)."""

    __tablename__ = "user_emails"
    id: str = Field(primary_key=True)  # Gmail email ID (not unique globally)
    user_id: str = Field(primary_key=True)  # Unique per user (composite key)
    company_name: str
    application_status: str
    received_at: datetime
    subject: str
    job_title: str
    email_from: str  # to avoid 'from' being a reserved key word


# --- target_repo/backend/db/user_job_status.py ---
from sqlmodel import SQLModel, Field


class UserJobStatuses(SQLModel, table=True):
    """Link row of ``user_job_statuses``: a user, a job, and a status."""

    __tablename__ = "user_job_statuses"
    user_job_status_id: int = Field(default=None, primary_key=True)
    # users.user_id is declared as `str`, so the referencing column is too.
    user_id: str = Field(foreign_key="users.user_id", nullable=False)
    # company_jobs has no "job_id" column; its primary key is company_job_id.
    job_id: int = Field(foreign_key="company_jobs.company_job_id", nullable=False)
    status_id: int = Field(foreign_key="job_statuses.status_id", nullable=False)


# --- target_repo/backend/db/user_jobs.py ---
from sqlmodel import SQLModel, Field
from datetime import datetime


class UserJobs(SQLModel, table=True):
    """Row of ``user_jobs``: a job a user applied to and when."""

    __tablename__ = "user_jobs"
    user_job_id: int = Field(primary_key=True, nullable=False)
    # users.user_id is declared as `str`, so the referencing column is too.
    user_id: str = Field(foreign_key="users.user_id", nullable=False)
    # company_jobs has no "job_id" column; its primary key is company_job_id.
    job_id: int = Field(foreign_key="company_jobs.company_job_id", nullable=False)
    applied_at: datetime


# --- target_repo/backend/db/user_session.py ---
from sqlmodel import SQLModel, Field
from uuid import UUID, uuid4
from datetime import datetime, timezone
from typing import Optional


class UserSession(SQLModel, table=True):
    """Row of ``user_session``: one login session of a user."""

    __tablename__ = "user_session"
    id: UUID = Field(default_factory=uuid4, primary_key=True)
    # users.user_id is declared as `str`, so the referencing column is too.
    user_id: str = Field(foreign_key="users.user_id")
    # default_factory must be a zero-argument callable. Passing the result of
    # datetime.now(...) handed it a datetime instance, which is not callable.
    session_start: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )
    session_end: Optional[datetime] = None
    user_agent: Optional[str] = None


# --- target_repo/backend/db/users.py ---
from sqlmodel import SQLModel, Field
from pydantic import BaseModel
from datetime import datetime


class UserData(BaseModel):
    """Plain (non-table) representation of a user."""

    user_id: str
    user_email: str
    start_date: datetime


class Users(SQLModel, table=True):
    """Row of the ``users`` table."""

    __tablename__ = "users"
    user_id: str = Field(default=None, primary_key=True)
    user_email: str = Field(nullable=False)
    start_date: datetime = Field(nullable=False)  # Start date for job applications
# --- target_repo/backend/db/utils/user_email_utils.py ---
import email.utils
import logging
from datetime import datetime, timezone

logger = logging.getLogger(__name__)


def parse_email_date(date_str: str) -> datetime:
    """
    Converts an email date string into a Python datetime object.

    Falls back to the current UTC time when the header is missing or cannot
    be parsed. ``parsedate_to_datetime`` signals bad input by raising
    (``ValueError``, or ``TypeError`` on older Pythons) rather than by
    returning ``None``, so the fallback has to live in an ``except`` clause.
    """
    try:
        dt = email.utils.parsedate_to_datetime(date_str)
    except (TypeError, ValueError, AttributeError):
        dt = None
    if dt is None:
        # default to current UTC datetime
        dt = datetime.now(timezone.utc)
    return dt


def check_email_exists(user_id: str, email_id: str) -> bool:
    """
    Checks if an email with the given emailId and userId exists in the database.
    """
    # Project imports are resolved at call time so that importing this module
    # (e.g. to use parse_email_date) does not require the database engine.
    from database import engine
    from db.user_emails import UserEmails
    from sqlmodel import Session, select

    with Session(engine) as session:
        statement = select(UserEmails).where(
            (UserEmails.user_id == user_id) & (UserEmails.id == email_id)
        )
        return session.exec(statement).first() is not None


def create_user_email(user, message_data: dict) -> "UserEmails | None":
    """
    Creates a UserEmail record instance from the provided data.

    Returns ``None`` when the email is already stored for this user or when
    the record cannot be built (the error is logged).
    """
    from db.user_emails import UserEmails

    try:
        # parse_email_date exists because different date formats were being
        # pulled from the data
        received_at = parse_email_date(message_data["received_at"])
        if check_email_exists(user.user_id, message_data["id"]):
            logger.info(f"Email with ID {message_data['id']} already exists in the database.")
            return None
        return UserEmails(
            id=message_data["id"],
            user_id=user.user_id,
            company_name=message_data["company_name"],
            application_status=message_data["application_status"],
            received_at=received_at,
            subject=message_data["subject"],
            job_title=message_data["job_title"],
            email_from=message_data["from"],
        )
    except Exception as e:
        logger.error(f"Error creating UserEmail record: {e}")
        return None
# --- target_repo/backend/db/utils/user_utils.py ---
import logging
from typing import Optional, Tuple
from db.user_emails import UserEmails
from sqlmodel import Session, select, func
from db.users import Users
from datetime import datetime, timedelta, timezone

logger = logging.getLogger(__name__)


def get_last_email_date(user_id: str) -> Optional[datetime]:
    """
    Checks date of user's most recent email.

    Returns the maximum ``received_at`` stored for ``user_id``, or ``None``
    when the user has no stored emails.
    """
    from database import engine

    with Session(engine) as session:
        row = session.exec(
            select(func.max(UserEmails.received_at))
            .where(UserEmails.user_id == user_id)
        ).one()  # aggregates in SQL to a single row
        return row


def user_exists(user) -> Tuple[bool, Optional[datetime]]:
    """
    Checks if user is already in the database.

    Returns ``(False, None)`` for an unknown user, otherwise ``(True, date)``
    where ``date`` is the result of ``get_last_email_date`` for that user.
    """
    from database import engine

    with Session(engine) as session:
        existing_user = session.exec(
            select(Users).where(Users.user_id == user.user_id)
        ).first()
        if not existing_user:
            return False, None
        return True, get_last_email_date(user.user_id)


def add_user(user, request, start_date=None) -> Users:
    """
    Writes user data to the users model and session storage.

    The start date is taken from the explicit ``start_date`` argument, then
    from ``user.start_date``, and finally defaults to 90 days before now
    (UTC). Previously the argument was overwritten unconditionally and never
    used. An existing user is returned unchanged.
    """
    from database import engine

    with Session(engine) as session:
        # Check if the user already exists in the database
        existing_user = session.exec(
            select(Users).where(Users.user_id == user.user_id)
        ).first()

        if existing_user:
            logger.info(f"User {user.user_id} already exists in the database.")
            return existing_user

        start_date = (
            start_date
            or getattr(user, "start_date", None)
            or (datetime.now(timezone.utc) - timedelta(days=90))
        )

        # NOTE(review): the value is stored as a "YYYY-MM-DD" string although
        # Users.start_date is declared as datetime - kept as-is, confirm the
        # database driver accepts it.
        if isinstance(start_date, datetime):
            start_date = start_date.strftime("%Y-%m-%d")

        # add a new user record
        new_user = Users(
            user_id=user.user_id,
            user_email=user.user_email,
            start_date=start_date,
        )

        session.add(new_user)
        session.commit()
        session.refresh(new_user)
        logger.info(f"Created new user record for user_id: {user.user_id}")

        # Write start date to session storage
        if isinstance(start_date, str):
            request.session["start_date"] = start_date  # Already a string, no need to convert
        else:
            request.session["start_date"] = start_date.isoformat()  # Convert only if it's a datetime object

        return new_user
# --- target_repo/backend/alembic/env.py ---
from logging.config import fileConfig

from sqlalchemy import engine_from_config
from sqlalchemy import pool

from alembic import context

# Put the parent of this file's directory on sys.path so project modules
# can be imported from here.
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))

from sqlmodel import SQLModel

# This is the Alembic Config object
config = context.config

# Interpret the config file for Python logging
if config.config_file_name is not None:
    fileConfig(config.config_file_name)

# Migrations are generated/run against SQLModel's shared metadata.
# NOTE(review): no model module is imported in this file - confirm the table
# classes are registered on SQLModel.metadata before relying on autogenerate.
target_metadata = SQLModel.metadata

# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.


def run_migrations_offline() -> None:
    """Run migrations in 'offline' mode.

    Configures the Alembic context from the ``sqlalchemy.url`` option only
    (no engine is created here) and runs the migrations in one transaction.
    """
    context.configure(
        url=config.get_main_option("sqlalchemy.url"),
        target_metadata=target_metadata,
        literal_binds=True,
        dialect_opts={"paramstyle": "named"},
    )

    with context.begin_transaction():
        context.run_migrations()


def run_migrations_online() -> None:
    """Run migrations in 'online' mode.

    Builds an engine from the ``sqlalchemy.``-prefixed options of the active
    config section and runs the migrations over a live connection.
    """
    engine = engine_from_config(
        config.get_section(config.config_ini_section),
        prefix="sqlalchemy.",
        poolclass=pool.NullPool,
    )

    with engine.connect() as conn:
        context.configure(connection=conn, target_metadata=target_metadata)

        with context.begin_transaction():
            context.run_migrations()


if context.is_offline_mode():
    run_migrations_offline()
else:
    run_migrations_online()


# --- target_repo/backend/alembic/versions/6240656d52f6_add_job_title_column.py ---
"""add_job_title_column

Revision ID: 6240656d52f6
Revises: b240c664ed46
Create Date: 2025-03-16 21:31:17.486275

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision: str = '6240656d52f6'
down_revision: Union[str, None] = 'b240c664ed46'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Add a nullable ``job_title`` column to ``user_email``."""
    job_title_column = sa.Column('job_title', sa.String(255), nullable=True)
    op.add_column('user_email', job_title_column)


def downgrade() -> None:
    """Drop the ``job_title`` column from ``user_email``."""
    op.drop_column('user_email', 'job_title')


# --- target_repo/backend/alembic/versions/b240c664ed46_change_user_email_id_to_varchar.py ---
"""change_user_email_id_to_varchar

Revision ID: b240c664ed46
Revises: 
Create Date: 2025-03-16 02:58:30.325992

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision: str = 'b240c664ed46'
down_revision: Union[str, None] = None
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Turn ``user_email.id`` into VARCHAR(255) and key the table on (id, user_id)."""
    # The primary key depends on the id column, so it is dropped first.
    op.execute('ALTER TABLE user_email DROP CONSTRAINT IF EXISTS user_email_pkey')

    # Convert the column, casting existing values with ``id::varchar``.
    op.alter_column(
        'user_email',
        'id',
        existing_type=sa.INTEGER(),
        type_=sa.VARCHAR(255),
        postgresql_using='id::varchar',
    )

    # Re-create the primary key as a composite of id and user_id.
    op.execute('ALTER TABLE user_email ADD PRIMARY KEY (id, user_id)')


def downgrade() -> None:
    """Restore an INTEGER ``id`` column that is the sole primary key."""
    # Remove the composite primary key.
    op.execute('ALTER TABLE user_email DROP CONSTRAINT IF EXISTS user_email_pkey')

    # Cast id back with ``id::integer`` (non-numeric ids cannot be converted).
    op.alter_column(
        'user_email',
        'id',
        existing_type=sa.VARCHAR(255),
        type_=sa.INTEGER(),
        postgresql_using='id::integer',
    )

    # Primary key on id only, as before the upgrade.
    op.execute('ALTER TABLE user_email ADD PRIMARY KEY (id)')


# --- target_repo/backend/alembic/versions/c256d0279ea6_rename_user_email_table_to_plural.py ---
"""rename_user_email_table_to_plural

Revision ID: c256d0279ea6
Revises: 6240656d52f6
Create Date: 2025-03-17 03:16:53.078420

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision: str = 'c256d0279ea6'
down_revision: Union[str, None] = '6240656d52f6'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Rename table ``user_email`` to ``user_emails``."""
    op.rename_table('user_email', 'user_emails')


def downgrade() -> None:
    """Rename table ``user_emails`` back to ``user_email``."""
    op.rename_table('user_emails', 'user_email')
# --- target_repo/backend/routes/auth_routes.py ---
import datetime
import logging
from fastapi import APIRouter, Depends, HTTPException, Request, BackgroundTasks
from fastapi.responses import RedirectResponse, HTMLResponse
from google_auth_oauthlib.flow import Flow

from db.utils.user_utils import user_exists
from utils.auth_utils import AuthenticatedUser
from session.session_layer import create_random_session_string, validate_session
from utils.config_utils import get_settings
from utils.cookie_utils import set_conditional_cookie
from routes.email_routes import fetch_emails_to_db
from slowapi import Limiter
from slowapi.util import get_remote_address

limiter = Limiter(key_func=get_remote_address)

# Logger setup
logger = logging.getLogger(__name__)

# Get settings
settings = get_settings()

# FastAPI router for Google login
router = APIRouter()

APP_URL = settings.APP_URL


@router.get("/login")
@limiter.limit("10/minute")
async def login(request: Request, background_tasks: BackgroundTasks):
    """Handles Google OAuth2 login and authorization code exchange.

    Without a ``code`` query parameter the client is redirected to Google's
    consent page. With one, the code is exchanged for credentials, the
    session is populated, and the client is redirected to ``/processing``
    (known user, email fetch scheduled in the background) or ``/dashboard``
    (new user). Any unexpected error yields a plain 500 response.
    """
    # Imported under an alias: the module-level name ``Request`` is FastAPI's
    # request class, which is not a google-auth transport and cannot be
    # instantiated without arguments.
    from google.auth.transport.requests import Request as GoogleAuthRequest

    code = request.query_params.get("code")
    flow = Flow.from_client_secrets_file(
        settings.CLIENT_SECRETS_FILE,
        settings.GOOGLE_SCOPES,
        redirect_uri=settings.REDIRECT_URI,
    )

    try:
        if not code:
            # NOTE(review): the returned OAuth `state` is neither stored nor
            # checked on the callback - consider persisting it in the session
            # and comparing it to guard against login CSRF.
            authorization_url, _state = flow.authorization_url(prompt="consent")
            return RedirectResponse(url=authorization_url)

        logger.info("Authorization code received, exchanging for token...")
        try:
            flow.fetch_token(code=code)
        except Exception as e:
            logger.error("Failed to fetch token: %s", e)
            return RedirectResponse(
                url=f"{settings.APP_URL}/errors?message=permissions_error",
                status_code=303,
            )
        try:
            creds = flow.credentials
        except Exception as e:
            logger.error("Failed to fetch credentials: %s", e)
            return RedirectResponse(
                url=f"{settings.APP_URL}/errors?message=credentials_error",
                status_code=303,
            )

        if not creds.valid:
            creds.refresh(GoogleAuthRequest())
            return RedirectResponse("/login", status_code=303)

        user = AuthenticatedUser(creds)
        session_id = request.session["session_id"] = create_random_session_string()

        # Set session details
        try:
            token_expiry = creds.expiry.isoformat()
        except Exception as e:
            logger.error("Failed to parse token expiry: %s", e)
            token_expiry = (
                datetime.datetime.utcnow() + datetime.timedelta(hours=1)
            ).isoformat()

        request.session["token_expiry"] = token_expiry
        request.session["user_id"] = user.user_id
        request.session["creds"] = creds.to_json()
        request.session["access_token"] = creds.token

        # NOTE: change redirection once dashboard is completed
        exists, last_fetched_date = user_exists(user)
        if exists:
            logger.info("User already exists in the database.")
            response = RedirectResponse(
                url=f"{settings.APP_URL}/processing", status_code=303
            )
            background_tasks.add_task(
                fetch_emails_to_db, user, request, last_fetched_date, user_id=user.user_id
            )
            logger.info("Background task started for user_id: %s", user.user_id)
        else:
            request.session["is_new_user"] = True
            response = RedirectResponse(
                url=f"{settings.APP_URL}/dashboard", status_code=303
            )
            logger.info("User does not exist")

        response = set_conditional_cookie(
            key="Authorization", value=session_id, response=response
        )

        return response
    except Exception as e:
        logger.error("Login error: %s", e)
        return HTMLResponse(content="An error occurred, sorry!", status_code=500)
@router.get("/logout")
async def logout(request: Request, response: RedirectResponse):
    """Clear the server-side session, expire the auth cookies, and redirect to the app.

    The cookie deletions must be attached to the response that is actually
    returned. They used to be applied to the injected ``response`` object
    while a different ``RedirectResponse`` was sent, so the client never
    received them. The ``response`` parameter is kept only so the route
    signature stays unchanged.
    """
    logger.info("Logging out")
    request.session.clear()

    redirect = RedirectResponse(f"{APP_URL}", status_code=303)
    redirect.delete_cookie(key="__Secure-Authorization")
    redirect.delete_cookie(key="Authorization")
    return redirect


@router.get("/me")
async def getUser(request: Request, user_id: str = Depends(validate_session)):
    """Return the user id resolved by ``validate_session``; 401 when there is none."""
    if not user_id:
        raise HTTPException(
            status_code=401, detail="No user id found in session"
        )
    return {"user_id": user_id}
# --- target_repo/backend/routes/email_routes.py ---
import logging
from typing import List, Optional
from fastapi import APIRouter, Depends, Request, HTTPException, BackgroundTasks
from fastapi.responses import HTMLResponse, JSONResponse, RedirectResponse
from sqlmodel import Session, select, desc
from googleapiclient.discovery import build
from db.user_emails import UserEmails
from db import processing_tasks as task_models
from db.utils.user_email_utils import create_user_email
from utils.auth_utils import AuthenticatedUser
from utils.email_utils import get_email_ids, get_email
from utils.llm_utils import process_email
from utils.config_utils import get_settings
from session.session_layer import validate_session
import database
from google.oauth2.credentials import Credentials
import json
from start_date.storage import get_start_date_email_filter
from constants import QUERY_APPLIED_EMAIL_FILTER
from datetime import datetime, timedelta
from slowapi import Limiter
from slowapi.util import get_remote_address

limiter = Limiter(key_func=get_remote_address)

# Logger setup
logger = logging.getLogger(__name__)

# Get settings
settings = get_settings()
APP_URL = settings.APP_URL

SECONDS_BETWEEN_FETCHING_EMAILS = 1 * 60 * 60  # 1 hour

# FastAPI router for email routes
router = APIRouter()


@router.get("/processing", response_class=HTMLResponse)
async def processing(request: Request, db_session: database.DBSession, user_id: str = Depends(validate_session)):
    """Report the progress of the user's email-processing task.

    Redirects to ``/logout`` without a valid session, raises 404 when no task
    run exists for the user, and otherwise returns a JSON body with a message
    and the processed/total email counters.
    """
    # Use the module logger (not the root ``logging`` module) like the rest
    # of this file.
    logger.info("user_id:%s processing", user_id)
    if not user_id:
        logger.info("user_id: not found, redirecting to login")
        return RedirectResponse("/logout", status_code=303)

    process_task_run: task_models.TaskRuns = db_session.get(task_models.TaskRuns, user_id)

    if process_task_run is None:
        raise HTTPException(
            status_code=404, detail="Processing has not started."
        )

    if process_task_run.status == task_models.FINISHED:
        logger.info("user_id: %s processing complete", user_id)
        message = "Processing complete"
    else:
        logger.info("user_id: %s processing not complete for file", user_id)
        message = "Processing in progress"

    return JSONResponse(
        content={
            "message": message,
            "processed_emails": process_task_run.processed_emails,
            "total_emails": process_task_run.total_emails,
        }
    )


@router.get("/get-emails", response_model=List[UserEmails])
@limiter.limit("5/minute")
def query_emails(request: Request, db_session: database.DBSession, user_id: str = Depends(validate_session)) -> List[UserEmails]:
    """Return the user's stored emails, newest first (empty list when none).

    Raises a 500 ``HTTPException`` when the query fails. The underlying
    error is logged with its traceback but not echoed to the client.
    """
    try:
        logger.info(f"Fetching emails for user_id: {user_id}")

        # Query emails sorted by date (newest first)
        statement = (
            select(UserEmails)
            .where(UserEmails.user_id == user_id)
            .order_by(desc(UserEmails.received_at))
        )
        user_emails = db_session.exec(statement).all()

        logger.info(f"Found {len(user_emails)} emails for user_id: {user_id}")
        return user_emails  # Return empty list if no emails exist

    except Exception as e:
        logger.exception(f"Error fetching emails for user_id {user_id}: {e}")
        raise HTTPException(
            status_code=500, detail="An error occurred while fetching emails"
        )
@router.delete("/delete-email/{email_id}")
async def delete_email(request: Request, db_session: database.DBSession, email_id: str, user_id: str = Depends(validate_session)):
    """
    Delete an email record by its ID for the authenticated user.

    Raises 404 when the user has no email with that id and 500 when the
    deletion itself fails.
    """
    try:
        # Query the email record to ensure it exists and belongs to the user
        email_record = db_session.exec(
            select(UserEmails).where(
                (UserEmails.id == email_id) & (UserEmails.user_id == user_id)
            )
        ).first()

        if not email_record:
            logger.warning(f"Email with id {email_id} not found for user_id {user_id}")
            raise HTTPException(
                status_code=404, detail=f"Email with id {email_id} not found"
            )

        # Delete the email record
        # NOTE(review): only flush() is called here - presumably the
        # database.DBSession dependency commits afterwards; confirm.
        db_session.delete(email_record)
        db_session.flush()

        logger.info(f"Email with id {email_id} deleted successfully for user_id {user_id}")
        return {"message": "Item deleted successfully"}

    except HTTPException:
        # Let the deliberate 404 above through; the generic handler below
        # used to catch it and turn it into a 500.
        raise
    except Exception as e:
        logger.error(f"Error deleting email with id {email_id} for user_id {user_id}: {e}")
        raise HTTPException(
            status_code=500, detail=f"Failed to delete email: {str(e)}"
        )


@router.post("/fetch-emails")
@limiter.limit("5/minute")
async def start_fetch_emails(
    request: Request, background_tasks: BackgroundTasks, user_id: str = Depends(validate_session)
):
    """Starts the background task for fetching and processing emails.

    Responds 403 without a valid session, 401 when the session holds no
    stored credentials, and 500 when the credentials cannot be rebuilt.
    """
    if not user_id:
        raise HTTPException(status_code=403, detail="Unauthorized")
    logger.info(f"user_id:{user_id} start_fetch_emails")

    # Retrieve stored credentials
    creds_json = request.session.get("creds")
    if not creds_json:
        logger.error(f"Missing credentials for user_id: {user_id}")
        return HTMLResponse(content="User not authenticated. Please log in again.", status_code=401)

    try:
        # Convert JSON string back to Credentials object
        creds = Credentials.from_authorized_user_info(json.loads(creds_json))
        user = AuthenticatedUser(creds)

        logger.info(f"Starting email fetching process for user_id: {user_id}")

        # Start email fetching in the background
        background_tasks.add_task(fetch_emails_to_db, user, request, user_id=user_id)

        return JSONResponse(content={"message": "Email fetching started"}, status_code=200)
    except Exception as e:
        logger.error(f"Error reconstructing credentials: {e}")
        raise HTTPException(status_code=500, detail="Failed to authenticate user")
def fetch_emails_to_db(user: AuthenticatedUser, request: Request, last_updated: Optional[datetime] = None, *, user_id: str) -> None:
    """Fetch the user's job-application emails from Gmail and store them.

    Progress is tracked in the user's ``TaskRuns`` row (status plus the
    processed/total counters). A run is skipped when the row was updated less
    than ``SECONDS_BETWEEN_FETCHING_EMAILS`` ago.

    Args:
        user: Authenticated user; ``user.creds`` is used to build the Gmail client.
        request: Request whose session supplies ``start_date`` / ``is_new_user``.
        last_updated: When given, only emails received after it are requested.
        user_id: Id of the user the task runs for.
    """
    logger.info(f"Fetching emails to db for user_id: {user_id}")

    with Session(database.engine) as db_session:
        # we track starting and finishing fetching of emails for each user
        process_task_run = (
            db_session.query(task_models.TaskRuns).filter_by(user_id=user_id).one_or_none()
        )
        if process_task_run is None:
            # if this is the first time running the task for the user, create a record
            process_task_run = task_models.TaskRuns(user_id=user_id)
            db_session.add(process_task_run)
        else:
            last_run = process_task_run.updated
            # Subtracting a naive and an aware datetime raises TypeError, so
            # take "now" with the same awareness as the stored value.
            now = datetime.now(last_run.tzinfo) if last_run.tzinfo else datetime.now()
            if now - last_run < timedelta(seconds=SECONDS_BETWEEN_FETCHING_EMAILS):
                # limit how frequently emails can be fetched by a specific user
                logger.warning(
                    "Less than an hour since last fetch of emails for user",
                    extra={"user_id": user_id},
                )
                return

        # this is helpful if the user applies for a new job and wants to rerun the analysis during the same session
        process_task_run.processed_emails = 0
        process_task_run.total_emails = 0
        process_task_run.status = task_models.STARTED

        db_session.commit()  # sync with the database so calls in the future reflect the task is already started

        start_date = request.session.get("start_date")
        logger.info(f"start_date: {start_date}")
        query = get_start_date_email_filter(start_date)
        is_new_user = request.session.get("is_new_user")

        # check for users last updated email
        if last_updated:
            # this converts our date time to number of seconds
            additional_time = int(last_updated.timestamp())
            # The epoch seconds are appended as Gmail's ``after:`` operator so
            # that only emails received after the newest stored one are fetched.
            if not start_date or not is_new_user:
                query = QUERY_APPLIED_EMAIL_FILTER
                query += f" after:{additional_time}"

            logger.info(f"user_id:{user_id} Fetching emails after {last_updated.isoformat()}")
        else:
            logger.info(f"user_id:{user_id} Fetching all emails (no last_date maybe with start date)")

        service = build("gmail", "v1", credentials=user.creds)

        messages = get_email_ids(query=query, gmail_instance=service)
        # Update session to remove "new user" status
        # NOTE(review): login schedules this function as a background task -
        # confirm this session write is still persisted at that point.
        request.session["is_new_user"] = False

        if not messages:
            logger.info(f"user_id:{user_id} No job application emails found.")
            process_task_run = db_session.get(task_models.TaskRuns, user_id)
            process_task_run.status = task_models.FINISHED
            db_session.commit()
            return

        total = len(messages)
        logger.info(f"user_id:{user.user_id} Found {total} emails.")
        process_task_run.total_emails = total
        db_session.commit()

        email_records = []  # list to collect email records

        for idx, message in enumerate(messages):
            msg_id = message["id"]
            logger.info(
                f"user_id:{user_id} begin processing for email {idx + 1} of {total} with id {msg_id}"
            )
            process_task_run.processed_emails = idx + 1
            db_session.commit()

            msg = get_email(message_id=msg_id, gmail_instance=service)
            if not msg:
                continue

            # Reset per message: when extraction fails, ``result`` must not
            # be unbound (first email) or still hold the previous email's data.
            result = None
            try:
                result = process_email(msg["text_content"])
            except Exception as e:
                logger.error(
                    f"user_id:{user_id} Error processing email {idx + 1} of {total} with id {msg_id}: {e}"
                )

            if isinstance(result, dict) and result:
                # if values are empty strings or null, set them to "unknown"
                result = {key: (value if value else "unknown") for key, value in result.items()}
                logger.info(
                    f"user_id:{user_id} successfully extracted email {idx + 1} of {total} with id {msg_id}"
                )
                # A missing status key must not crash the whole run.
                status = str(result.get("job_application_status", "")).lower()
                if status == "false positive, not related to job search":
                    logger.info(
                        f"user_id:{user_id} email {idx + 1} of {total} with id {msg_id} is a false positive, not related to job search"
                    )
                    continue  # skip this email if it's a false positive
            else:  # processing returned unknown which is also likely false positive
                logger.warning(
                    f"user_id:{user_id} failed to extract email {idx + 1} of {total} with id {msg_id}"
                )
                # Same keys as the ones read below.
                result = {
                    "company_name": "unknown",
                    "job_application_status": "unknown",
                    "job_title": "unknown",
                }

            message_data = {
                "id": msg_id,
                "company_name": result.get("company_name", "unknown"),
                "application_status": result.get("job_application_status", "unknown"),
                "received_at": msg.get("date", "unknown"),
                "subject": msg.get("subject", "unknown"),
                "job_title": result.get("job_title", "unknown"),
                "from": msg.get("from", "unknown"),
            }
            email_record = create_user_email(user, message_data)
            if email_record:
                email_records.append(email_record)

        # batch insert all records at once
        if email_records:
            db_session.add_all(email_records)
            logger.info(
                f"Added {len(email_records)} email records for user {user_id}"
            )

        process_task_run.status = task_models.FINISHED
        db_session.commit()

        logger.info(f"user_id:{user_id} Email fetching complete.")
# --- target_repo/backend/routes/file_routes.py ---
import csv
import os
import logging
import plotly.graph_objects as go
from fastapi import APIRouter, HTTPException, Request, Depends
from fastapi.responses import FileResponse, RedirectResponse
from slowapi import Limiter
from slowapi.util import get_remote_address
import database
from utils.file_utils import get_user_filepath
from session.session_layer import validate_session
from routes.email_routes import query_emails


# Logger setup
logger = logging.getLogger(__name__)

# FastAPI router for file routes
router = APIRouter()
limiter = Limiter(key_func=get_remote_address)


@router.get("/download-file")
async def download_file(request: Request, user_id: str = Depends(validate_session)):
    """Send the user's previously written ``emails.csv``.

    Redirects to ``/logout`` without a valid session and raises 400 when the
    file does not exist.
    """
    if not user_id:
        return RedirectResponse("/logout", status_code=303)
    # Built with os.path.join like the other routes in this file.
    filepath = os.path.join(get_user_filepath(user_id), "emails.csv")
    if os.path.exists(filepath):
        logger.info("user_id:%s downloading from filepath %s", user_id, filepath)
        return FileResponse(filepath)
    raise HTTPException(status_code=400, detail="File not found")


@router.get("/write-to-csv")
async def write_to_csv(request: Request, db_session: database.DBSession, user_id: str = Depends(validate_session)):
    """Write the user's stored emails to ``emails.csv`` in their directory.

    Redirects to ``/logout`` without a valid session and raises 400 when the
    user has no emails. Returns a message naming the written file.
    """
    if not user_id:
        return RedirectResponse("/logout", status_code=303)

    # Get job related email data from DB
    emails = query_emails(request, db_session=db_session, user_id=user_id)
    if not emails:
        raise HTTPException(status_code=400, detail="No data found to write")

    directory = get_user_filepath(user_id)
    os.makedirs(directory, exist_ok=True)  # Ensure the directory exists
    filepath = os.path.join(directory, "emails.csv")

    # Key: DB field name; Value: Human-readable field name
    field_mapping = {
        "company_name": "Company Name",
        "application_status": "Application Status",
        "received_at": "Received At",
        "subject": "Subject",
        "email_from": "Sender",
    }
    selected_fields = list(field_mapping)

    # Write to CSV. The encoding is explicit so non-ASCII subjects or sender
    # names do not depend on (or fail under) the platform default encoding.
    with open(filepath, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(field_mapping.values())
        for email in emails:
            # Only the selected fields are exported, in mapping order.
            writer.writerow([getattr(email, field) for field in selected_fields])

    logger.info("CSV file created at %s", filepath)
    return {"message": f"CSV file written successfully at {filepath}"}
# Write and download csv
@router.get("/process-csv")
@limiter.limit("2/minute")
async def process_csv(request: Request, db_session: database.DBSession, user_id: str = Depends(validate_session)):
    """Write the user's stored emails to ``emails.csv`` and send the file.

    Redirects to ``/logout`` without a valid session and raises 400 when the
    user has no emails or the written file cannot be found.
    """
    if not user_id:
        return RedirectResponse("/logout", status_code=303)

    directory = get_user_filepath(user_id)
    filepath = os.path.join(directory, "emails.csv")

    # Get job related email data from DB
    emails = query_emails(request, db_session=db_session, user_id=user_id)
    if not emails:
        raise HTTPException(status_code=400, detail="No data found to write")
    # Ensure the directory exists
    os.makedirs(directory, exist_ok=True)

    # Key: DB field name; Value: Human-readable field name
    field_mapping = {
        "company_name": "Company Name",
        "application_status": "Application Status",
        "received_at": "Received At",
        "job_title": "Job Title",
        "subject": "Subject",
        "email_from": "Sender",
    }
    selected_fields = list(field_mapping)

    # Write to CSV. The encoding is explicit so non-ASCII subjects or sender
    # names do not depend on (or fail under) the platform default encoding.
    with open(filepath, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(field_mapping.values())
        for email in emails:
            # Only the selected fields are exported, in mapping order.
            writer.writerow([getattr(email, field) for field in selected_fields])

    logger.info("CSV file created at %s", filepath)

    # Download CSV
    if os.path.exists(filepath):
        logger.info("user_id:%s downloading from filepath %s", user_id, filepath)
        return FileResponse(filepath)

    # File not found error
    raise HTTPException(status_code=400, detail="File not found")


# Write and download sankey diagram
@router.get("/process-sankey")
@limiter.limit("2/minute")
async def process_sankey(request: Request, db_session: database.DBSession, user_id: str = Depends(validate_session)):
    """Render a Sankey diagram of the user's application statuses as a PNG download.

    Redirects to ``/logout`` without a valid session, raises 400 when the
    user has no emails, and raises 500 when the image cannot be produced.
    """
    # Validate user session, redirect if invalid
    if not user_id:
        return RedirectResponse("/logout", status_code=303)

    # Get job related email data from DB
    emails = query_emails(request, db_session=db_session, user_id=user_id)
    if not emails:
        raise HTTPException(status_code=400, detail="No data found to write")

    # Statuses that get their own node, in node order. Any other status only
    # counts towards the total number of applications.
    tracked_statuses = (
        "offer",
        "rejected",
        "request for availability",
        "interview scheduled",
        "no response",
    )
    counts = dict.fromkeys(tracked_statuses, 0)
    for email in emails:
        # normalize the output
        status = email.application_status.strip().lower()
        if status in counts:
            counts[status] += 1

    num_applications = len(emails)
    num_offers = counts["offer"]
    num_rejected = counts["rejected"]
    num_request_for_availability = counts["request for availability"]
    num_interview_scheduled = counts["interview scheduled"]
    num_no_response = counts["no response"]

    # Create the Sankey diagram: node 0 (applications) links to each status node.
    fig = go.Figure(go.Sankey(
        node=dict(label=[f"Applications ({num_applications})",
                         f"Offers ({num_offers})",
                         f"Rejected ({num_rejected})",
                         f"Request for Availability ({num_request_for_availability})",
                         f"Interview Scheduled ({num_interview_scheduled})",
                         f"No Response ({num_no_response})"]),
        link=dict(source=[0, 0, 0, 0, 0], target=[1, 2, 3, 4, 5],
                  value=[num_offers, num_rejected, num_request_for_availability,
                         num_interview_scheduled, num_no_response])))

    # Define the user's file path and ensure the directory exists
    directory = get_user_filepath(user_id)
    filename = "sankey_diagram.png"
    filepath = os.path.join(directory, filename)
    os.makedirs(directory, exist_ok=True)

    try:
        # Save the Sankey diagram as PNG
        fig.write_image(filepath)  # Requires Kaleido for image export
        logger.info("user_id:%s Sankey diagram saved to %s", user_id, filepath)

        # Return the file with correct headers and explicit filename
        return FileResponse(
            filepath,
            media_type="image/png",  # Correct media type for PNG
            filename=filename,
            headers={"Content-Disposition": f"attachment; filename={filename}"}  # Ensure correct filename in header
        )
    except Exception as e:
        logger.error("Error generating Sankey diagram for user_id:%s - %s", user_id, str(e))
        raise HTTPException(status_code=500, detail="Error generating Sankey diagram")
num_request_for_availability += 13222 elif status == "interview scheduled":3223 num_interview_scheduled += 13224 elif status == "no response":3225 num_no_response += 132263227 # Create the Sankey diagram3228 fig = go.Figure(go.Sankey(3229 node=dict(label=[f"Applications ({num_applications})", 3230 f"Offers ({num_offers})", 3231 f"Rejected ({num_rejected})", 3232 f"Request for Availability ({num_request_for_availability})", 3233 f"Interview Scheduled ({num_interview_scheduled})", 3234 f"No Response ({num_no_response})"]),3235 link=dict(source=[0, 0, 0, 0, 0], target=[1, 2, 3, 4, 5], 3236 value=[num_offers, num_rejected, num_request_for_availability, num_interview_scheduled, num_no_response])))323732383239 # Define the user's file path and ensure the directory exists3240 directory = get_user_filepath(user_id)3241 filename = "sankey_diagram.png"3242 filepath = os.path.join(directory, filename)32433244 # Ensure the directory exists3245 os.makedirs(directory, exist_ok=True)32463247 try:3248 # Save the Sankey diagram as PNG3249 fig.write_image(filepath) # Requires Kaleido for image export3250 logger.info("user_id:%s Sankey diagram saved to %s", user_id, filepath)32513252 # Return the file with correct headers and explicit filename3253 return FileResponse(3254 filepath,3255 media_type="image/png", # Correct media type for PNG3256 filename=filename, 3257 headers={"Content-Disposition": f"attachment; filename={filename}"} # Ensure correct filename in header3258 )3259 except Exception as e:3260 logger.error("Error generating Sankey diagram for user_id:%s - %s", user_id, str(e))3261 raise HTTPException(status_code=500, detail="Error generating Sankey diagram")32623263 32643265---3266target_repo/backend/routes/start_date_routes.py3267---3268import logging3269from fastapi import APIRouter, Request, Form, Depends3270from fastapi.responses import JSONResponse, HTMLResponse3271from db.utils.user_utils import add_user3272import json3273from utils.auth_utils import 
@router.post("/set-start-date")
@limiter.limit("1/minute")
async def set_start_date(request: Request, start_date: str = Form(...), user_id: str = Depends(validate_session)):
    """Store the user's job search start date.

    Args:
        request: Incoming request; its session supplies the stored
            credentials and receives the ``is_new_user`` flag.
        start_date: Start date submitted as a form field.
        user_id: Value produced by ``validate_session``; falsy when the
            session was not accepted.

    Returns:
        A JSON success message, or an HTML error response with status 400
        (invalid session), 401 (no stored credentials) or 500 (saving failed).
    """
    # Use only the identifier returned by validate_session. Re-reading
    # "user_id" from the raw session would accept sessions that the
    # dependency rejected (it returns a falsy value for those).
    if not user_id:
        return HTMLResponse(content="Invalid request. Please log in again.", status_code=400)

    # Retrieve stored credentials
    creds_json = request.session.get("creds")
    if not creds_json:
        logger.error("user_id:%s missing credentials /set-start-date", user_id)
        return HTMLResponse(content="User not authenticated. Please log in again.", status_code=401)

    try:
        # Convert the JSON string back into a Credentials object
        creds = Credentials.from_authorized_user_info(json.loads(creds_json))
        user = AuthenticatedUser(creds, start_date)

        # Save start date in DB
        add_user(user, request, start_date)

        # Update session to remove "new user" status
        request.session["is_new_user"] = False

        logger.info("user_id:%s added start date %s", user_id, start_date)
        return JSONResponse(content={"message": "Start date updated successfully"}, status_code=200)
    except Exception:
        # Covers both credential reconstruction and the save itself;
        # logger.exception keeps the traceback for diagnosis.
        logger.exception("user_id:%s failed to save start date", user_id)
        return HTMLResponse(content="Failed to save start date. Try again.", status_code=500)


def get_start_date(request: Request, user_id: str = Depends(validate_session)) -> str:
    """Return the job search start date stored in the request session.

    The value is read from ``request.session["start_date"]``; no database
    query is made here. ``None`` is returned when the key is absent.
    """
    logger.info("Getting start date for user_id: %s", user_id)
    return request.session.get("start_date")


@router.get("/api/session-data")
@limiter.limit("5/minute")
async def get_session_data(request: Request, user_id: str = Depends(validate_session)):
    """Return the session fields for the current user.

    Returns:
        A JSON object with ``user_id``, ``token_expiry``, ``session_id`` and
        ``is_new_user``, or a 401 JSON error when ``validate_session`` did
        not accept the session.
    """
    # Use the validated identifier rather than the raw session value so a
    # rejected session cannot read its data back.
    if not user_id:
        logger.warning("Session data missing user_id. Possible expired or invalid session.")
        return JSONResponse(content={"error": "Session expired or invalid"}, status_code=401)

    session_data = {
        "user_id": user_id,
        "token_expiry": request.session.get("token_expiry"),
        "session_id": request.session.get("session_id"),
        "is_new_user": request.session.get("is_new_user", False),
    }

    # The session id is deliberately kept out of the logs.
    logger.info("Returning session data for user_id=%s", user_id)

    return JSONResponse(content=session_data)
@router.get("/get-response-rate")
@limiter.limit("2/minute")
def response_rate_by_job_title(request: Request, db_session: database.DBSession, user_id: str = Depends(validate_session)):
    """Return the response rate per job title for the current user.

    Each (company, job title) pair is counted once. An application counts as
    a response when its status is "request for availability", "offer" or
    "interview scheduled" (compared case-insensitively, ignoring surrounding
    whitespace).

    Returns:
        A list of ``{"title": ..., "rate": ...}`` dicts in order of first
        appearance, with ``rate`` a percentage rounded to two decimals.

    Raises:
        HTTPException: 401 when ``validate_session`` did not accept the
            session; 500 on any unexpected failure.
    """
    if not user_id:
        raise HTTPException(status_code=401, detail="Session expired or invalid")

    positive_statuses = {"request for availability", "offer", "interview scheduled"}

    try:
        # Get job related email data from DB
        user_emails = query_emails(request, db_session=db_session, user_id=user_id)

        # (company, job_title) pairs already counted; a set keeps the
        # duplicate check O(1) per email.
        counted_applications = set()
        # job title -> counters; dict order preserves first appearance.
        totals = {}

        for email in user_emails:
            application = (email.company_name, email.job_title)
            if application in counted_applications:
                continue
            counted_applications.add(application)

            counters = totals.setdefault(email.job_title, {"responses": 0, "total": 0})
            counters["total"] += 1
            # The status may be missing; treat that as "no response".
            status = (email.application_status or "").strip().lower()
            if status in positive_statuses:
                counters["responses"] += 1

        return [
            {
                "title": title,
                "rate": round(counters["responses"] / counters["total"] * 100, 2),
            }
            for title, counters in totals.items()
        ]

    except HTTPException:
        # Keep HTTP errors as they are instead of rewrapping them as 500.
        raise
    except Exception as exc:
        logger.exception("Error fetching job titles for user_id %s", user_id)
        # Internal exception text is logged, not sent to the client.
        raise HTTPException(
            status_code=500,
            detail="An error occurred while calculating response rates",
        ) from exc


@router.get("/user-response-rate")
def calculate_response_rate(
    request: Request, db_session: database.DBSession, user_id: str = Depends(validate_session)
) -> dict[str, float]:
    """Return the share of the user's emails that are interview requests.

    "request for availability" is used as the interview-request marker, as
    it should come before an offer or a scheduled interview.

    Returns:
        ``{"value": <percentage rounded to one decimal>}``; the value is
        ``0.0`` when the user has no stored emails.

    Raises:
        HTTPException: 401 when ``validate_session`` did not accept the
            session.
    """
    if not user_id:
        raise HTTPException(status_code=401, detail="Session expired or invalid")

    user_emails = db_session.exec(
        select(UserEmails).where(UserEmails.user_id == user_id)
    ).all()

    total_apps = len(user_emails)
    if total_apps == 0:
        # Same shape as the non-empty case so clients parse one format.
        return {"value": 0.0}

    interview_requests = sum(
        1
        for email in user_emails
        if email.application_status
        and email.application_status.strip().lower() == "request for availability"
    )

    return {"value": round(interview_requests / total_apps * 100, 1)}
def is_token_expired(iso_expiry: str) -> bool:
    """Return True when the token expiry is missing, unparsable, or in the past.

    Args:
        iso_expiry: Expiry time as an ISO 8601 string. A value without a UTC
            offset is interpreted as UTC. May be empty or ``None``.

    Returns:
        ``True`` if the token must be treated as expired, ``False`` if the
        expiry lies strictly in the future.
    """
    # Local import: the surrounding module only imports `datetime`.
    from datetime import timezone

    if not iso_expiry:
        return True

    try:
        # fromisoformat() accepts a trailing "Z" only on Python 3.11+.
        text = iso_expiry[:-1] + "+00:00" if iso_expiry.endswith("Z") else iso_expiry
        expiry = datetime.fromisoformat(text)
    except (AttributeError, TypeError, ValueError):
        # Fail closed: an expiry that cannot be read counts as expired.
        return True

    # Compare aware datetimes only; mixing naive and aware values raises
    # TypeError, and datetime.utcnow() is deprecated.
    if expiry.tzinfo is None:
        expiry = expiry.replace(tzinfo=timezone.utc)

    return expiry <= datetime.now(timezone.utc)