Bitsec (subnet 60)

Vulnerability History

Date         High Risk   Low Risk
2025-05-29   1           1

Audit Report Details

3527 Lines of Code
2 Open
0 Resolved
🚨 High Risk Vulnerabilities
⚠️ Low Risk Vulnerabilities

Vulnerable Code:

# Repo Tree (Python files only, excluding .gitignored files)

├── __init__.py
├── backend
│   ├── __init__.py
│   ├── alembic
│   │   ├── env.py
│   │   └── versions
│   │       ├── 6240656d52f6_add_job_title_column.py
│   │       ├── b240c664ed46_change_user_email_id_to_varchar.py
│   │       └── c256d0279ea6_rename_user_email_table_to_plural.py
│   ├── config.py
│   ├── constants.py
│   ├── database.py
│   ├── db
│   │   ├── companies.py
│   │   ├── company_jobs.py
│   │   ├── job_status.py
│   │   ├── job_titles.py
│   │   ├── processing_tasks.py
│   │   ├── user_emails.py
│   │   ├── user_job_status.py
│   │   ├── user_jobs.py
│   │   ├── user_session.py
│   │   ├── users.py
│   │   └── utils
│   │       ├── user_email_utils.py
│   │       └── user_utils.py
│   ├── email_query_filters
│   ├── main.py
│   ├── routes
│   │   ├── auth_routes.py
│   │   ├── email_routes.py
│   │   ├── file_routes.py
│   │   ├── start_date_routes.py
│   │   └── users_routes.py
│   ├── session
│   │   └── session_layer.py
│   ├── start_date
│   │   └── storage.py
│   ├── static
│   ├── templates
│   ├── tests
│   │   ├── __init__.py
│   │   ├── conftest.py
│   │   ├── routes
│   │   │   ├── __init__.py
│   │   │   ├── conftest.py
│   │   │   └── test_email_routes.py
│   │   ├── test_config_utils.py
│   │   ├── test_constants.py
│   │   ├── test_email_utils.py
│   │   ├── test_filter_schema.py
│   │   └── test_filter_utils.py
│   └── utils
│       ├── auth_utils.py
│       ├── config_utils.py
│       ├── cookie_utils.py
│       ├── email_utils.py
│       ├── file_utils.py
│       ├── filter_utils.py
│       └── llm_utils.py
├── docs
│   └── use_cases
├── frontend
│   ├── app
│   │   ├── api
│   │   │   └── subscribe
│   │   ├── dashboard
│   │   ├── errors
│   │   ├── logout
│   │   ├── preview
│   │   │   ├── dashboard
│   │   │   └── processing
│   │   ├── processing
│   ├── components
│   ├── config
│   ├── public
│   ├── styles
│   ├── tests
│   ├── types
│   └── utils


# Complete repo contents (files-to-prompt output)

target_repo/__init__.py
---


---
target_repo/backend/__init__.py
---


---
target_repo/backend/config.py
---
import json

from pydantic import field_validator
from pydantic_settings import BaseSettings, SettingsConfigDict, NoDecode
from typing import List
from typing_extensions import Annotated
import logging

logger = logging.getLogger(__name__)


class Settings(BaseSettings):
    GOOGLE_SCOPES: Annotated[List[str], NoDecode]
    REDIRECT_URI: str
    GOOGLE_CLIENT_ID: str
    GOOGLE_API_KEY: str
    COOKIE_SECRET: str
    CLIENT_SECRETS_FILE: str = "credentials.json"
    ENV: str = "dev"
    APP_URL: str
    ORIGIN: str = ".jobba.help"
    DATABASE_URL: str = "default-for-local"
    DATABASE_URL_LOCAL_VIRTUAL_ENV: str = (
        "postgresql://postgres:postgres@localhost:5433/jobseeker_analytics"
    )
    DATABASE_URL_DOCKER: str = (
        "postgresql://postgres:postgres@db:5432/jobseeker_analytics"
    )

    @field_validator("GOOGLE_SCOPES", mode="before")
    @classmethod
    def decode_scopes(cls, v: str) -> List[str]:
        logger.info("Decoded scopes from string: %s", json.loads(v.strip("'\"")))
        return json.loads(v.strip("'\""))

    @property
    def is_publicly_deployed(self) -> bool:
        return self.ENV in ["prod", "staging"]

    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="allow")


settings = Settings(_env_file=".env", _env_file_encoding="utf-8")

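Because GOOGLE_SCOPES is annotated NoDecode, pydantic-settings hands the raw env string to decode_scopes, which strips wrapping quotes and JSON-decodes it (and logs the decoded scopes at INFO). A minimal sketch of that round trip, mirroring the repo's own tests in backend/tests/test_config_utils.py; the scope values are illustrative:

# Sketch only (not part of the audited file): decode_scopes is a classmethod,
# so it can be exercised directly, exactly as the repo's tests do.
raw = '\'["https://www.googleapis.com/auth/gmail.readonly", "openid"]\''
assert Settings.decode_scopes(raw) == [
    "https://www.googleapis.com/auth/gmail.readonly",
    "openid",
]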
---
target_repo/backend/constants.py
---
"""
This file contains the main constants used in the application.
"""

from datetime import datetime, timedelta
from pathlib import Path
from utils.filter_utils import (
    parse_base_filter_config,
)  # , parse_override_filter_config


GENERIC_ATS_DOMAINS = [
    "us.greenhouse-mail.io",
    "smartrecruiters.com",
    "linkedin.com",
    "ashbyhq.com",
    "hire.lever.co",
    "hi.wellfound.com",
    "talent.icims.com",
    "myworkday.com",
    "otta.com",
]

DEFAULT_DAYS_AGO = 30
# Get the current date
current_date = datetime.now()

# Subtract 30 days
date_days_ago = current_date - timedelta(days=DEFAULT_DAYS_AGO)

# Format the date in the required format (YYYY/MM/DD)
formatted_date = date_days_ago.strftime("%Y/%m/%d")

APPLIED_FILTER_PATH = (
    Path(__file__).parent / "email_query_filters" / "applied_email_filter.yaml"
)
APPLIED_FILTER_OVERRIDES_PATH = (
    Path(__file__).parent
    / "email_query_filters"
    / "applied_email_filter_overrides.yaml"
)
QUERY_APPLIED_EMAIL_FILTER = (
    f"after:{formatted_date} AND ({parse_base_filter_config(APPLIED_FILTER_PATH)})"
)

# ------ implement override filter later!! #
# OR \n"
# f"{parse_override_filter_config(APPLIED_FILTER_OVERRIDES_PATH)})"
# )
# label:jobs -label:query4

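Taken together, these constants assemble one Gmail search string at import time: a rolling after: date (30 days back) ANDed with the query parsed from the base filter YAML. A sketch of the resulting shape; the date and filter body below are hypothetical, since the YAML file is not included in this dump:

# Sketch only: approximate shape of QUERY_APPLIED_EMAIL_FILTER.
example_query = (
    'after:2025/04/29 AND ((subject:"application has been submitted"'
    ' OR from:"[email protected]"))'
)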
---
target_repo/backend/database.py
---
import os
from typing import Annotated
from sqlmodel import SQLModel, create_engine, Session
from utils.config_utils import get_settings
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
import fastapi


def create_db_and_tables():
    SQLModel.metadata.create_all(engine)

def get_session():
    return Session(engine)


def request_session():
    session = get_session()

    with session.begin():
        yield session


DBSession = Annotated[Session, fastapi.Depends(request_session)]

settings = get_settings()
IS_DOCKER_CONTAINER = os.environ.get("IS_DOCKER_CONTAINER", 0)
if IS_DOCKER_CONTAINER:
    DATABASE_URL = settings.DATABASE_URL_DOCKER
elif settings.is_publicly_deployed:
    DATABASE_URL = settings.DATABASE_URL
else:
    DATABASE_URL = settings.DATABASE_URL_LOCAL_VIRTUAL_ENV

engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
Base = declarative_base()

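DBSession packages request_session as a FastAPI dependency: each request gets a Session wrapped in session.begin(), so the transaction commits on normal exit and rolls back on error. A minimal sketch of a consuming route (the endpoint itself is hypothetical, not in the audited repo):

# Sketch only: a hypothetical route consuming the DBSession dependency.
from fastapi import APIRouter
from database import DBSession

router = APIRouter()

@router.get("/example")  # hypothetical endpoint
def read_example(session: DBSession):
    # the yielded session is bound to an open transaction for this request
    return {"in_transaction": session.in_transaction()}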
---
target_repo/backend/main.py
---
import logging

from fastapi import FastAPI, HTTPException, Request, Depends
from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from starlette.middleware.sessions import SessionMiddleware
from fastapi.middleware.cors import CORSMiddleware
from slowapi import Limiter
from slowapi.util import get_remote_address
from slowapi.errors import RateLimitExceeded
from slowapi.middleware import SlowAPIMiddleware
from db.users import UserData
from db.utils.user_utils import add_user
from utils.config_utils import get_settings
from session.session_layer import validate_session
from contextlib import asynccontextmanager
from database import create_db_and_tables

# Import routes
from routes import email_routes, auth_routes, file_routes, users_routes, start_date_routes

@asynccontextmanager
async def lifespan(app: FastAPI):
    create_db_and_tables()
    yield

app = FastAPI(lifespan=lifespan)
settings = get_settings()
APP_URL = settings.APP_URL
app.add_middleware(SessionMiddleware, secret_key=settings.COOKIE_SECRET)
app.mount("/static", StaticFiles(directory="static"), name="static")

# Register routes
app.include_router(auth_routes.router)
app.include_router(email_routes.router)
app.include_router(file_routes.router)
app.include_router(users_routes.router)
app.include_router(start_date_routes.router)

limiter = Limiter(key_func=get_remote_address)
app.state.limiter = limiter  # Ensure limiter is assigned

# Configure CORS
if settings.is_publicly_deployed:
    # Production CORS settings
    origins = ["https://www.jobba.help", "https://www.staging.jobba.help",
               "https://www.app.justajobapp.com", "https://www.api.justajobapp.com"]
else:
    # Development CORS settings
    origins = [
        "http://localhost:3000",  # Assuming frontend runs on port 3000
        "http://127.0.0.1:3000",
    ]

# Add SlowAPI middleware for rate limiting
app.add_middleware(SlowAPIMiddleware)

# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,  # Allow frontend origins
    allow_credentials=True,
    allow_methods=["*"],  # Allow all HTTP methods (GET, POST, etc.)
    allow_headers=["*"],  # Allow all headers
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,  # Allow frontend origins
    allow_credentials=True,
    allow_methods=["*"],  # Allow all HTTP methods (GET, POST, etc.)
    allow_headers=["*"],  # Allow all headers
)

# Set up Jinja2 templates
templates = Jinja2Templates(directory="templates")

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.DEBUG, format="%(levelname)s - %(message)s")


# Rate limit exception handler
@app.exception_handler(RateLimitExceeded)
async def rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded):
    raise HTTPException(
        status_code=429,
        detail="Too many requests. Please try again later.",
    )


@app.post("/api/add-user")
@limiter.limit("3/minute")
async def add_user_endpoint(user_data: UserData, request: Request, user_id: str = Depends(validate_session)):
    """
    This endpoint adds a user to the database and session storage
    """
    try:
        add_user(user_data, request)
        return {"message": "User added successfully"}
    except Exception as e:
        # Log the error for debugging purposes
        logger.error(f"An error occurred while adding user: {e}")
        return {"error": "An error occurred while adding the user."}


@app.get("/")
async def root(request: Request, response_class=HTMLResponse):
    return templates.TemplateResponse("homepage.html", {"request": request})

# Run the app using Uvicorn
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)

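Note that CORSMiddleware is registered twice with identical settings, so every request traverses two CORS layers. Starlette middleware is a stack built by add_middleware and can be inspected directly; a sketch of what that inspection would show for this app:

# Sketch only: listing the registered middleware stack (outermost first).
for mw in app.user_middleware:
    print(mw.cls.__name__)
# Expected: CORSMiddleware, CORSMiddleware, SlowAPIMiddleware, SessionMiddleware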
---
target_repo/backend/start_date/storage.py
---
"""
This file contains the main constants used in the application.
"""
from pathlib import Path
from utils.filter_utils import (
    parse_base_filter_config,
)
from constants import QUERY_APPLIED_EMAIL_FILTER

APPLIED_FILTER_PATH = (
    Path(__file__).parent.parent / "email_query_filters" / "applied_email_filter.yaml"
)

def get_start_date_email_filter(start_date: str) -> str:
    if not start_date:
        return QUERY_APPLIED_EMAIL_FILTER

    START_DATE_EMAIL_FILTER = (
        f"after:{start_date} AND ({parse_base_filter_config(APPLIED_FILTER_PATH)})"
    )
    return START_DATE_EMAIL_FILTER

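get_start_date_email_filter falls back to the default 30-day query when start_date is falsy; any other value is interpolated into the Gmail query verbatim, with no format validation. A sketch of both branches (the date is hypothetical):

# Sketch only: the two branches of get_start_date_email_filter.
assert get_start_date_email_filter("") == QUERY_APPLIED_EMAIL_FILTER
custom = get_start_date_email_filter("2025/01/01")
assert custom.startswith("after:2025/01/01 AND (")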
---
target_repo/backend/tests/__init__.py
---


---
target_repo/backend/tests/conftest.py
---
import sys
import os

import pytest
from testcontainers.postgres import PostgresContainer
import sqlalchemy as sa
from sqlalchemy.orm import Session
from sqlmodel import SQLModel

# Add the parent directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
os.chdir("./backend")

import database  # noqa: E402


@pytest.fixture(scope="session")
def postgres_container():
    with PostgresContainer("postgres:13") as postgres:
        yield postgres


@pytest.fixture
def engine(postgres_container: PostgresContainer, monkeypatch):
    test_engine = sa.create_engine(
        sa.URL.create(
            "postgresql",
            username=postgres_container.username,
            password=postgres_container.password,
            host=postgres_container.get_container_host_ip(),
            port=postgres_container.get_exposed_port(postgres_container.port),
            database=postgres_container.dbname,
        )
    )

    monkeypatch.setattr(database, "engine", test_engine)

    database.create_db_and_tables()

    yield test_engine

    with test_engine.begin() as transaction:
        transaction.execute(
            sa.text("SET session_replication_role = :role"), {"role": "replica"}
        )
        for table in SQLModel.metadata.tables.values():
            transaction.execute(table.delete())


@pytest.fixture
def db_session(engine, monkeypatch):
    with Session(database.engine) as session:
        yield session

---
target_repo/backend/tests/test_config_utils.py
---
from unittest.mock import patch
from utils.config_utils import get_settings
from config import Settings
import pytest
import json
import os


@pytest.fixture(scope="session", autouse=True)
def setup_static_directory():
    static_dir = os.path.join(os.path.dirname(__file__), "../static")
    if not os.path.exists(static_dir):
        os.makedirs(static_dir)


@patch("utils.config_utils.config.Settings")
def test_get_settings_only_called_once_with_lru(mock_settings_call):
    get_settings.cache_clear()
    get_settings()
    get_settings()
    # Ensure the Settings constructor is called only once due to lru_cache
    mock_settings_call.assert_called_once()
    get_settings.cache_clear()


def test_import_settings_does_not_raise_error():
    import backend.utils.llm_utils  # noqa: F401
    import backend.utils.auth_utils  # noqa: F401


def test_decode_scopes_valid_json():
    input_str = '["https://www.googleapis.com/auth/gmail.readonly", "openid"]'
    expected_output = ["https://www.googleapis.com/auth/gmail.readonly", "openid"]
    assert Settings.decode_scopes(input_str) == expected_output


def test_decode_scopes_with_extra_quotes():
    input_str = '\'["https://www.googleapis.com/auth/gmail.readonly", "openid"]\''
    expected_output = ["https://www.googleapis.com/auth/gmail.readonly", "openid"]
    assert Settings.decode_scopes(input_str) == expected_output


def test_decode_scopes_invalid_json():
    input_str = '["https://www.googleapis.com/auth/gmail.readonly", "openid"'
    with pytest.raises(json.JSONDecodeError):
        Settings.decode_scopes(input_str)


def test_decode_scopes_empty_string():
    input_str = ""
    with pytest.raises(json.JSONDecodeError):
        Settings.decode_scopes(input_str)


def test_prod_is_publicly_deployed_true():
    settings = Settings(ENV="prod")
    assert settings.is_publicly_deployed


def test_dev_is_publicly_deployed_false():
    settings = Settings(ENV="dev")
    assert not settings.is_publicly_deployed


def test_staging_is_publicly_deployed_true():
    settings = Settings(ENV="staging")
    assert settings.is_publicly_deployed

---
target_repo/backend/tests/test_constants.py
---
from pathlib import Path

SUBJECT_LINE = "Invitation from an unknown sender: Interview with TestCompanyName @ Thu May 2, 2024 11:00am - 12pm (PDT) ([email protected])"
SAMPLE_MESSAGE = {
    "id": "abc123",
    "threadId": "abc123",
    "labelIds": ["IMPORTANT", "CATEGORY_PERSONAL", "Label_1"],
    "snippet": "Interview with TestCompanyName Unknown sender This event from [email protected] won't appear in your calendar unless you say you know the sender. Know this sender? When Thursday May 9, 2024 ⋅ 02:40pm –",
    "payload": {
        "partId": "",
        "mimeType": "multipart/mixed",
        "filename": "",
        "headers": [
            {"name": "Delivered-To", "value": "[email protected]"},
            {
                "name": "Received",
                "value": "by 2024:abc:6000:2000:b0:200:1000:5000 with SMTP id cub; Thu, 2 May 2024 16:45:00 -0700 (PDT)",
            },
            {
                "name": "X-Received",
                "value": "by 2024:abc:6000:2000:b0:200:1000:5000 with SMTP id def567-890jkl.9.000000000000; Thu, 2 May 2024 16:45:00 -0700 (PDT)",
            },
            {
                "name": "ARC-Seal",
                "value": "redacted-ARC-value",
            },
            {
                "name": "ARC-Message-Signature",
                "value": "i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=arc-00000000; h=to:from:subject:date:message-id:sender:reply-to:mime-version :dkim-signature:dkim-signature; bh=pqr123; fh=AZ123/PST=; b=GAH",
            },
            {
                "name": "ARC-Authentication-Results",
                "value": "i=1; mx.google.com; dkim=pass [email protected] header.s=10101101 header.b=WOOHOO; dkim=pass [email protected] header.s=google header.b=di8r; spf=pass (google.com: domain of [email protected] designates 000.00.000.00 as permitted sender) [email protected]; dmarc=pass (p=NONE sp=NONE dis=NONE) header.from=testcompanyname.com",
            },
            {"name": "Return-Path", "value": "<[email protected]>"},
            {
                "name": "Received",
                "value": "from mail-fff-a00.google.com (mail-fff-a00.google.com. [000.00.000.00]) by mx.google.com with SMTPS id def567-890mno.0.2024.05.02.16.45.00 for <[email protected]> (Google Transport Security); Thu, 2 May 2024 16:45:00 -0700 (PDT)",
            },
            {
                "name": "Received-SPF",
                "value": "pass (google.com: domain of [email protected] designates 000.00.000.00 as permitted sender) client-ip=000.00.000.00;",
            },
            {
                "name": "Authentication-Results",
                "value": "mx.google.com; dkim=pass [email protected] header.s=10101101 header.b=WOOHOO; dkim=pass [email protected] header.s=google header.b=di8r; spf=pass (google.com: domain of [email protected] designates 000.00.000.00 as permitted sender) [email protected]; dmarc=pass (p=NONE sp=NONE dis=NONE) header.from=testcompanyname.com",
            },
            {
                "name": "DKIM-Signature",
                "value": "v=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=10101101; t=1111111111; x=1111111111; dara=google.com; h=to:from:subject:date:message-id:sender:reply-to:mime-version:from :to:cc:subject:date:message-id:reply-to; bh=pqr123; b=GAH",
            },
            {
                "name": "DKIM-Signature",
                "value": "v=1; a=rsa-sha256; c=relaxed/relaxed; d=testcompanyname.com; s=google; t=1111111111; x=1111111111; dara=google.com; h=to:from:subject:date:message-id:sender:reply-to:mime-version:from :to:cc:subject:date:message-id:reply-to; bh=pqr123; b=GAH",
            },
            {
                "name": "X-Google-DKIM-Signature",
                "value": "v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=10101101; t=1111111111; x=1111111111; h=to:from:subject:date:message-id:sender:reply-to:mime-version :x-gm-message-state:from:to:cc:subject:date:message-id:reply-to; bh=pqr123; b=BLAH",
            },
            {
                "name": "X-Gm-Message-State",
                "value": "AGH",
            },
            {
                "name": "X-Google-Smtp-Source",
                "value": "AGH",
            },
            {"name": "MIME-Version", "value": "1.0"},
            {
                "name": "X-Received",
                "value": "by 2222:abc:600:2000:d0:777:9000:4000 with SMTP id def567-890ghi.10.1111111111566; Thu, 2 May 2024 16:45:00 -0700 (PDT)",
            },
            {
                "name": "Reply-To",
                "value": "Recruiter Name <[email protected]>",
            },
            {
                "name": "Sender",
                "value": "Google Calendar <[email protected]>",
            },
            {
                "name": "Message-ID",
                "value": "<[email protected]>",
            },
            {"name": "Date", "value": "Thu, 2 May 2024 16:45:00 +0000"},
            {
                "name": "Subject",
                "value": "Invitation from an unknown sender: Interview with TestCompanyName @ Thu May 2, 2024 11:00am - 12pm (PDT) ([email protected])",
            },
            {
                "name": "From",
                "value": "Recruiter Name <[email protected]>",
            },
            {"name": "To", "value": "[email protected]"},
            {
                "name": "Content-Type",
                "value": 'multipart/mixed; boundary="000000000000"',
            },
        ],
620 "body": {"size": 0},
621 "parts": [
622 {
623 "partId": "0",
624 "mimeType": "multipart/alternative",
625 "filename": "",
626 "headers": [
627 {
628 "name": "Content-Type",
629 "value": 'multipart/alternative; boundary="000000000000"',
630 }
631 ],
632 "body": {"size": 0},
633 "parts": [
634 {
635 "partId": "0.0",
636 "mimeType": "text/plain",
637 "filename": "",
638 "headers": [
639 {
640 "name": "Content-Type",
641 "value": 'text/plain; charset="UTF-8"; format=flowed; delsp=yes',
642 },
643 {"name": "Content-Transfer-Encoding", "value": "base64"},
644 ],
645 "body": {
646 "size": 2000,
647 "data": "abc",
648 },
649 },
650 {
651 "partId": "0.1",
652 "mimeType": "text/html",
653 "filename": "",
654 "headers": [
655 {
656 "name": "Content-Type",
657 "value": 'text/html; charset="UTF-8"',
658 },
659 {
660 "name": "Content-Transfer-Encoding",
661 "value": "quoted-printable",
662 },
663 ],
664 "body": {
665 "size": 30000,
666 "data": "abc",
667 },
668 },
669 {
670 "partId": "0.2",
671 "mimeType": "text/calendar",
672 "filename": "invite.ics",
673 "headers": [
674 {
675 "name": "Content-Type",
676 "value": 'text/calendar; charset="UTF-8"; method=REQUEST',
677 },
678 {"name": "Content-Transfer-Encoding", "value": "7bit"},
679 ],
680 "body": {
681 "attachmentId": "",
682 "size": 1000,
683 },
684 },
685 ],
686 },
687 {
688 "partId": "1",
689 "mimeType": "application/ics",
690 "filename": "invite.ics",
691 "headers": [
692 {
693 "name": "Content-Type",
694 "value": 'application/ics; name="invite.ics"',
695 },
696 {
697 "name": "Content-Disposition",
698 "value": 'attachment; filename="invite.ics"',
699 },
700 {"name": "Content-Transfer-Encoding", "value": "base64"},
701 ],
702 "body": {
703 "attachmentId": "",
704 "size": 1000,
705 },
706 },
707 ],
708 },
709 "sizeEstimate": 33333,
710 "historyId": "22222222",
711 "internalDate": "1111111111000",
712}

DESIRED_PASS_APPLIED_EMAIL_FILTER_SUBJECT = [
    "Thank you for your Application!",
    "Jobba, your application was sent to The Huts",
    "Your Interview with",
    "Thank you for your job application"
]

DESIRED_FAIL_APPLIED_EMAIL_FILTER_FROM = [
    "[email protected]",  # made up, would be better to capture the real example
    # ... (several entries elided in this dump)
]

DESIRED_FAIL_APPLIED_EMAIL_FILTER_SUBJECT = [
    "Apply to",
    "Apply now",
    "New job",
    "Job Search Council Matching - Next Steps"
]

DESIRED_PASS_APPLIED_EMAIL_FILTER_FROM = ["[email protected]", "myworkday.com"]

SAMPLE_FILTER_PATH = Path(__file__).parent / "sample_base_filter.yaml"
EXPECTED_SAMPLE_QUERY_STRING = """(subject:"application has been submitted"
    OR (subject:"application to" AND subject:"successfully submitted")
    OR from:"[email protected]"
    AND -from:"[email protected]"
    AND -subject:"watering")"""

---
target_repo/backend/tests/test_email_utils.py
---
from unittest import mock
import pytest

from tests.test_constants import SAMPLE_MESSAGE, SUBJECT_LINE
import utils.email_utils as email_utils
import db.utils.user_email_utils as user_email_utils

def test_get_top_consecutive_capitalized_words():
    test_cases = {
        (
            ("Hello", 10),  # capitalized, highest frequency, prioritize
            ("World", 8),  # capitalized, lower frequency, ignore
        ): "Hello",
        (
            ("Hello", 10),  # capitalized, highest frequency, prioritize
            ("World", 10),  # capitalized, highest frequency, add to result
            ("How", 5),  # capitalized, lower frequency, ignore
        ): "Hello World",
        (
            ("hello", 5),  # not capitalized, highest frequency, ignore
            ("World", 5),  # capitalized, highest frequency, prioritize
            ("How", 5),  # capitalized, highest frequency, add to result
            ("are", 5),  # not capitalized, highest frequency, ignore
        ): "World How",
        (
            ("hello", 5),  # not capitalized, highest frequency, ignore
            ("world", 5),  # capitalized, highest frequency, prioritize
            ("how", 5),  # capitalized, highest frequency, add to result
            ("are", 5),  # not capitalized, highest frequency, ignore
        ): "",  # no consecutive capitalized words
    }
    for word_list, expected_value in test_cases.items():
        result = email_utils.get_top_consecutive_capitalized_words(word_list)
        assert result == expected_value


def test_is_valid_email():
    email_test_cases = {
        "[email protected]": True,
        "[email protected]": False,  # Invalid domain
        "no-reply.com": False,  # Missing @
    }
    for email, expected_value in email_test_cases.items():
        is_valid = email_utils.is_valid_email(email)
        assert is_valid == expected_value, "email: %s" % email


def test_is_email_automated():
    email_test_cases = {
        "[email protected]": True,
        "[email protected]": True,
        "[email protected]": True,
        "[email protected]": True,
        "[email protected]": True,
        "[email protected]": False,
    }
    for email, expected_value in email_test_cases.items():
        is_automated = email_utils.is_automated_email(email)
        assert is_automated == expected_value, "email: %s" % email


def test_get_email_subject_line():
    subject_line = email_utils.get_email_subject_line(SAMPLE_MESSAGE)
    assert (
        subject_line
        == "Invitation from an unknown sender: Interview with TestCompanyName @ Thu May 2, 2024 11:00am - 12pm (PDT) ([email protected])"
    )


def test_get_email_from_address():
    from_address = email_utils.get_email_from_address(SAMPLE_MESSAGE)
    assert from_address == "[email protected]"


def test_get_email_domain():
    from_email_domain = email_utils.get_email_domain_from_address(
        email_utils.get_email_from_address(SAMPLE_MESSAGE)  # argument elided in this dump; reconstructed from the test above
    )
    assert from_email_domain == "testcompanyname.com"


def test_is_generic_email_domain():
    assert email_utils.is_generic_email_domain("hire.lever.co")
    assert email_utils.is_generic_email_domain("us.greenhouse-mail.io")


def test_get_last_capitalized_words_in_line():
    last_capitalized_words = email_utils.get_last_capitalized_words_in_line(
        "Thank you for your application to CompanyName"
    )
    assert last_capitalized_words == "CompanyName"


def test_get_company_name_returns_email_domain():
    company_name = email_utils.get_company_name(
        id="abc123", msg=SAMPLE_MESSAGE, subject_line=SUBJECT_LINE
    )
    assert company_name == "testcompanyname"


def test_get_company_name_returns_top_word():
    """Default behavior for company name is to return the
    highest frequency word that appears in the email body."""
    with mock.patch(
        "utils.email_utils.get_top_word_in_email_body", return_value="FakeCompany"
    ):
        company_name = email_utils.get_company_name(
            id="abc123", msg=SAMPLE_MESSAGE, subject_line=SUBJECT_LINE
        )
        assert company_name == "FakeCompany"


def test_get_company_name_returns_last_capital_word_in_subject_line():
    """Default behavior for company name is to return the
    highest frequency word that appears in the email body."""
    with (
        mock.patch(
            "utils.email_utils.get_top_word_in_email_body", return_value="interview"
        ),
        mock.patch(
            "utils.email_utils.get_email_from_address",
            return_value="[email protected]",
        ),
    ):
        company_name = email_utils.get_company_name(
            id="abc123",
            msg=SAMPLE_MESSAGE,
            subject_line="Thanks for interviewing with CoolCompany",
        )
        assert company_name == "CoolCompany"


def test_get_email_received_at_timestamp():
    received_at = email_utils.get_received_at_timestamp(1, SAMPLE_MESSAGE)
    assert received_at == "Thu, 2 May 2024 16:45:00 +0000"


@pytest.fixture
def mock_user():
    user = mock.MagicMock()
    user.user_id = "test_user_123"
    return user


@pytest.fixture
def message_data_with_list_values():
    """Message data where received_at is a list instead of a string"""
    return {
        "id": "19501385930c533f",
        "company_name": "",
        "application_status": "",
        "received_at": "Thu, 13 Feb 2025 21:30:24 +0000 (UTC)",
        "subject": "Message replied: Are you looking for Remote opportunities?",
        "job_title": "",
        "from": "Tester Recruiter <[email protected]>"
    }


@mock.patch('db.utils.user_email_utils.check_email_exists')
def test_create_user_email_with_list_values(mock_check_email, mock_user, message_data_with_list_values, caplog):
    """Test that create_user_email handles message_data_with_list_values correctly"""
    mock_check_email.return_value = False
    result = user_email_utils.create_user_email(mock_user, message_data_with_list_values)
    assert result is not None  # user email created successfully

---
target_repo/backend/tests/test_filter_schema.py
---
"""
these tests are intended to verify that the changes made to filter yamls will yield the
desired results. Note that these tests DO NOT make any checks against functions in
filter_utils. If you make changes there, the correct tests are found in test_filter_utils.

tests for override filters have not yet been implemented
"""

import pytest
from pathlib import Path
import yaml
from typing import List, Dict, Union
import re
from constants import APPLIED_FILTER_PATH  # , APPLIED_FILTER_OVERRIDES_PATH
from tests.test_constants import (
    DESIRED_FAIL_APPLIED_EMAIL_FILTER_SUBJECT,
    DESIRED_PASS_APPLIED_EMAIL_FILTER_FROM,
    SAMPLE_FILTER_PATH,
)

FilterConfigType = List[Dict[str, Union[str, int, bool, list, dict]]]

FILTER_CONFIG_DIR = Path(__file__).parent.parent / "email_query_filters"


def get_base_filter_config_paths() -> List[Path]:
    return [SAMPLE_FILTER_PATH] + [
        x for x in FILTER_CONFIG_DIR.iterdir() if "override" not in str(x)
    ]


def get_override_filter_config_paths() -> List[Path]:
    return [x for x in FILTER_CONFIG_DIR.iterdir() if "override" in str(x)]


def load_filter_config(filter_path: str) -> FilterConfigType:
    with open(filter_path, "r") as fid:
        filter_config = yaml.safe_load(fid)
    return filter_config


def validate_schema_block_order(filter_config: FilterConfigType) -> bool:
    """
    Validates that 'exclude' blocks appear after 'include' blocks in the schema.
    """

    include_seen = False
    for block in filter_config:
        how = block.get("how")
        if how == "include":
            include_seen = True
        elif how == "exclude" and not include_seen:
            return False  # Exclude block before any include block

    return True


@pytest.mark.parametrize(
    "filter_config", [load_filter_config(x) for x in get_base_filter_config_paths()]
)
def test_base_filter_yaml_schema(filter_config):
    logic_list = [block["logic"] for block in filter_config if block["logic"]]
    how_list = [block["how"] for block in filter_config]
    exclude_terms = sum(
        [block["terms"] for block in filter_config if block["how"] == "exclude"], []
    )

    assert all(
        [
            (x == "any" and y == "include") or (x == "all" and y == "exclude")
            for x, y in zip(logic_list, how_list)
        ]
    ), "logic=any is not allowed for how=exclude"
    assert all(["*" not in x for x in exclude_terms]), (
        "wildcard is not allowed in exclude blocks"
    )
    assert validate_schema_block_order(filter_config), (
        "Exclude block found before an include block"
    )


def apply_base_filter(field_text, field_name, filter_config) -> bool:
    """Applies the YAML filter to the given text."""

    ret_val = False  # Default to failing if no filter logic is defined.

    for block in filter_config:
        if block["field"] == field_name:
            # check if the text is in the any, include block for that field
            if block["logic"] == "any" and block["how"] == "include":
                # simple compare
                if not ret_val:
                    ret_val = any(
                        [
                            x.lower() in field_text.lower()
                            for x in block["terms"]
                            if "*" not in x
                        ]
                    )

                # use regex for wildcard compare
                if not ret_val:
                    ret_val = any(
                        [
                            re.findall(
                                x.replace(" * ", ".*").lower(), field_text.lower()
                            )
                            for x in block["terms"]
                            if "*" in x
                        ]
                    )

            # check if the text is in the all, exclude block for that field.
            # all, exclude logic will override any matching includes
            if ret_val:
                if block["logic"] == "all" and block["how"] == "exclude":
                    ret_val = all(
                        [x.lower() not in field_text.lower() for x in block["terms"]]
                    )

    return ret_val


@pytest.mark.parametrize(
    "test_constant,filter_config",
    [(DESIRED_FAIL_APPLIED_EMAIL_FILTER_SUBJECT, APPLIED_FILTER_PATH)],
)
def test_apply_email_filter_subject_fail(test_constant, filter_config):
    """
    Tests if the desired subject pairs in test_constants will fail the filter
    """
    filter_config = load_filter_config(APPLIED_FILTER_PATH)

    result_list = []
    for subject_text in test_constant:
        result = apply_base_filter(subject_text, "subject", filter_config)
        result_list.append(result)

    assert not any(result_list), (
        f"These subject pairs failed to fail: {[x for x, y in list(zip(test_constant, result_list)) if y]}"
    )


@pytest.mark.parametrize(
    "test_constant,filter_config",
    [(DESIRED_PASS_APPLIED_EMAIL_FILTER_FROM, APPLIED_FILTER_PATH)],
)
def test_apply_email_filter_from_pass(test_constant, filter_config):
    """
    Tests if the desired from pairs in test_constants will pass the filter
    """
    filter_config = load_filter_config(APPLIED_FILTER_PATH)

    result_list = []
    for from_text in test_constant:
        result = apply_base_filter(from_text, "from", filter_config)
        result_list.append(result)

    assert all(result_list), (
        f"These from pairs failed to pass: {[x for x, y in list(zip(test_constant, result_list)) if not y]}"
    )


@pytest.mark.parametrize(
    "test_constant,filter_config",
    [(DESIRED_FAIL_APPLIED_EMAIL_FILTER_SUBJECT, APPLIED_FILTER_PATH)],
)
def test_apply_email_filter_from_fail(test_constant, filter_config):
    """
    Tests if the desired from pairs in test_constants will fail the filter
    """
    filter_config = load_filter_config(APPLIED_FILTER_PATH)

    result_list = []
    for from_text in test_constant:
        result = apply_base_filter(from_text, "from", filter_config)
        result_list.append(result)

    assert not any(result_list), (
        f"These from pairs failed to fail: {[x for x, y in list(zip(test_constant, result_list)) if y]}"
    )

---
target_repo/backend/tests/test_filter_utils.py
---
"""
test that the strings produced by filter utils match expectations

tests for override filters have not yet been implemented.
"""

from typing import List, Dict, Union

from utils.filter_utils import (
    parse_base_filter_config,
)  # , parse_override_filter_config
from tests.test_constants import SAMPLE_FILTER_PATH, EXPECTED_SAMPLE_QUERY_STRING

FilterConfigType = List[Dict[str, Union[str, int, bool, list, dict]]]


def test_parse_filter_config_against_sample_filter(
    filter_path=SAMPLE_FILTER_PATH, expected_query_string=EXPECTED_SAMPLE_QUERY_STRING
):
    result_str = parse_base_filter_config(filter_path)

    # remove white space from expected string for the purpose of comparing
    expected_query_string = (
        expected_query_string.replace("\n", "").replace("\t", "").replace(" ", "")
    )

    assert result_str == expected_query_string, (
        "result query string doesn't match expected query string"
    )


---
target_repo/backend/tests/routes/__init__.py
---


---
target_repo/backend/tests/routes/conftest.py
---
from datetime import datetime, timedelta
from unittest import mock

import pytest
from fastapi.testclient import TestClient

from db.users import Users
import database
import main


@pytest.fixture
def client(db_session):
    main.app.dependency_overrides[database.request_session] = lambda: db_session
    test_client = TestClient(main.app)

    return test_client


@pytest.fixture
def logged_in_user(db_session, client):
    # create user
    user = Users(
        user_id="123",
        user_email="[email protected]",
        start_date=datetime(2000, 1, 1),
    )
    db_session.add(user)
    db_session.flush()

    # log in
    mock_credentials = mock.Mock(
        **{
            "expiry": datetime.utcnow() + timedelta(seconds=10),
            "token": "fake access token",
            "to_json.return_value": {"foo": "bar"},
        }
    )
    mock_decoded_token = {"sub": user.user_id, "email": user.user_email}
    with (
        mock.patch(
            "routes.auth_routes.Flow",
            **{"from_client_secrets_file.return_value.credentials": mock_credentials},
        ),
        mock.patch(
            "utils.auth_utils.id_token",
            **{"verify_oauth2_token.return_value": mock_decoded_token},
        ),
    ):
        auth_resp = client.get("/login", params={"code": "abc"}, follow_redirects=False)
        assert auth_resp.status_code == 303
        assert auth_resp.headers["Location"] == "http://localhost:3000/dashboard"

    return user

---
target_repo/backend/tests/routes/test_email_routes.py
---
from utils import auth_utils
from unittest import mock
from datetime import datetime

from fastapi import Request
from sqlalchemy.orm import Session
from google.oauth2.credentials import Credentials

from db.users import Users
from db.processing_tasks import TaskRuns, FINISHED, STARTED
from routes.email_routes import fetch_emails_to_db


def test_processing(db_session, client, logged_in_user):
    db_session.add(TaskRuns(user=logged_in_user, status=STARTED))
    db_session.flush()

    # make request to check on processing status
    resp = client.get("/processing", follow_redirects=False)

    # assert response
    assert resp.status_code == 200, resp.headers
    assert resp.json()["processed_emails"] == 0


def test_processing_404(db_session, client, logged_in_user):
    resp = client.get("/processing", follow_redirects=False)
    assert resp.status_code == 404


def test_fetch_emails_to_db(db_session: Session):
    test_user_id = "123"

    db_session.add(
        Users(
            user_id=test_user_id,
            user_email="[email protected]",
            start_date=datetime(2000, 1, 1),
        )
    )
    db_session.commit()

    with mock.patch("routes.email_routes.get_email_ids"):
        fetch_emails_to_db(
            auth_utils.AuthenticatedUser(Credentials("abc")),
            Request({"type": "http", "session": {}}),
            user_id=test_user_id,
        )

    task_run = db_session.get(TaskRuns, test_user_id)
    assert task_run.status == FINISHED


def test_fetch_emails_to_db_in_progress_rate_limited_no_processing(db_session: Session):
    test_user_id = "123"

    user = Users(
        user_id=test_user_id,
        user_email="[email protected]",
        start_date=datetime(2000, 1, 1),
    )
    db_session.add(user)
    db_session.add(TaskRuns(user=user, status=STARTED))
    db_session.commit()

    with mock.patch("routes.email_routes.get_email_ids") as mock_get_email_ids:
        fetch_emails_to_db(
            auth_utils.AuthenticatedUser(Credentials("abc")),
            Request({"type": "http", "session": {}}),
            user_id=test_user_id,
        )

    mock_get_email_ids.assert_not_called()
    task_run = db_session.get(TaskRuns, test_user_id)
    assert task_run.status == STARTED

---
target_repo/backend/utils/auth_utils.py
---
import logging
import uuid

from utils.file_utils import get_user_filepath

from google.oauth2.credentials import Credentials
from google.auth.transport.requests import Request
from google.oauth2 import id_token

from utils.config_utils import get_settings

logger = logging.getLogger(__name__)

settings = get_settings()


class AuthenticatedUser:
    """
    The AuthenticatedUser class is used to
    store information about the user. This
    class is instantiated after the user has
    successfully authenticated with Google.
    """

    def __init__(self, creds: Credentials, start_date=None):
        self.creds = creds
        self.user_id, self.user_email = self.get_user_id_and_email()
        self.filepath = get_user_filepath(self.user_id)
        self.start_date = start_date

    def get_user_id_and_email(self) -> tuple:
        """
        Retrieves the user ID and email from Google OAuth2 credentials.

        Parameters:

        Returns:
        - user_id: The unique user ID.
        - email: The user's email address.
        """
        try:
            logger.info("Verifying ID token...")

            # Ensure we have an ID token
            if not self.creds.id_token:
                logger.warning("ID token is missing, trying to refresh credentials...")
                self.creds.refresh(Request())  # Refresh credentials

            # If still missing, raise an error
            if not self.creds.id_token:
                raise ValueError("No ID token available after refresh.")

            decoded_token = id_token.verify_oauth2_token(
                self.creds.id_token, Request(), audience=settings.GOOGLE_CLIENT_ID
            )
            user_id = decoded_token["sub"]  # 'sub' is the unique user ID
            user_email = decoded_token.get("email")  # 'email' is the user's email address
            return user_id, user_email

        except (KeyError, TypeError):
            self.creds = self.creds.refresh(Request())
            if not self.creds.id_token:
                proxy_user_id = str(uuid.uuid4())
                logger.error(
                    "Could not retrieve user ID. Using proxy ID: %s", proxy_user_id
                )
                return proxy_user_id, None  # Generate a random ID and return None for email
            if not hasattr(self, "_retry"):
                self._retry = True
                return self.get_user_id_and_email()
            else:
                proxy_user_id = str(uuid.uuid4())
                logger.error(
                    "Could not retrieve user ID after retry. Using proxy ID: %s",
                    proxy_user_id,
                )
                return proxy_user_id, None  # Generate a random ID and return None for email
        except Exception as e:
            logger.error("Error verifying ID token: %s", e)
            proxy_user_id = str(uuid.uuid4())
            logger.error("Could not verify ID token. Using proxy ID: %s", proxy_user_id)
            return proxy_user_id, None  # Generate a random ID and return None for email

---
target_repo/backend/utils/config_utils.py
---
from functools import lru_cache
import config


@lru_cache
def get_settings():
    return config.Settings()

---
target_repo/backend/utils/cookie_utils.py
---
from fastapi import Response
from utils.config_utils import get_settings

settings = get_settings()


def set_conditional_cookie(
    response: Response,
    key: str,
    value: str,
    max_age: int = 3600,  # 1 hour
    path: str = "/",
    httponly: bool = True,
):
    """Helper function to set cookies with environment-appropriate settings"""
    cookie_params = {
        "key": key,
        "value": value,
        "max_age": max_age,
        "path": path,
        "httponly": httponly,
    }

    # Add environment-specific parameters
    if settings.is_publicly_deployed:
        cookie_params.update(
            {"domain": settings.ORIGIN, "secure": True, "samesite": "Strict"}
        )
    else:
        cookie_params.update({"secure": False, "samesite": "Lax"})

    # Apply cookie prefixes for additional security
    if cookie_params["secure"]:
        if cookie_params["path"] == "/" and "domain" not in cookie_params:
            cookie_params["key"] = f"__Host-{cookie_params['key']}"
        else:
            cookie_params["key"] = f"__Secure-{cookie_params['key']}"

    response.set_cookie(**cookie_params)
    return response

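Tracing the prefix logic shows the __Host- branch is effectively dead: when is_publicly_deployed is true, secure=True but a domain key is always present, so every cookie gets the weaker __Secure- prefix; locally, secure=False and no prefix is applied at all. A sketch (cookie name hypothetical):

# Sketch only: tracing set_conditional_cookie's branches.
from fastapi import Response

resp = set_conditional_cookie(Response(), key="session_id", value="abc")
# publicly deployed -> name "__Secure-session_id" ("__Host-" is unreachable,
# because the domain parameter is always set alongside secure=True)
# local dev        -> name stays "session_id" (secure=False, no prefix)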
---
target_repo/backend/utils/email_utils.py
---
import base64
import email
import logging
import re
from typing import Dict, Any

from bs4 import BeautifulSoup
from email_validator import validate_email, EmailNotValidError

from constants import GENERIC_ATS_DOMAINS

logger = logging.getLogger(__name__)


def clean_whitespace(text: str) -> str:
    """
    remove \n, \r, and \t from strings
    """
    return text.replace("\n", "").replace("\r", "").replace("\t", "")


def is_automated_email(email: str) -> bool:
    """
    Determines if an email address is automated or from a person.

    Parameters:
        email (str): The email address to classify.

    Returns:
        bool: True if automated, False otherwise.
    """
    # Define patterns for common automated prefixes and domains
    automated_patterns = [
        r"^no[-_.]?reply@",  # Matches "no-reply", "no_reply", "noreply"
        r"^do[-_.]?not[-_.]?reply@",  # Matches "do-not-reply", "do_not_reply"
        r"^notifications@",  # Matches "notifications@"
        r"^team@",  # Matches "team@"
        r"^hello@",  # Matches "hello@" (often automated)
        r"@smartrecruiters\.com$",  # Matches specific automated domains
    ]

    # Check against the patterns
    for pattern in automated_patterns:
        if re.search(pattern, email, re.IGNORECASE):
            return True  # It's an automated email

    return False  # It's likely from a person


def is_valid_email(email: str) -> bool:
    try:
        validate_email(email)
        return True
    except EmailNotValidError as e:
        # email is not valid, exception message is human-readable
        print(str(e))
        return False


def get_email_content(email_data: Dict[str, Any]) -> str:
    """
    parses html content of email data and appends it to text content and subject content

    Note 1: linkedIn easy apply messages have *different* html and text_content, so we need to keep both
    Note 2: some automated emails only contain the information about the company in the subject and
    not the email body, so we need to append this to make sure the email processor gets to see it.

    """
    text_content = email_data["subject"]

    if email_data["text_content"]:
        text_content += "\n"
        text_content += email_data["text_content"]

    if email_data["html_content"]:
        soup = BeautifulSoup(email_data["html_content"], "html.parser")
        html_content = soup.get_text(separator=" ", strip=True)

        text_content += "\n"
        text_content += html_content

    return text_content


def get_email(message_id: str, gmail_instance=None):
    if gmail_instance:
        try:
            message = (
                gmail_instance.users()
                .messages()
                .get(userId="me", id=message_id, format="raw")
                .execute()
            )
            msg_str = base64.urlsafe_b64decode(message["raw"].encode("ASCII")).decode(
                "utf-8"
            )
            mime_msg = email.message_from_string(msg_str)
            # logger.info("mime_msg: %s", mime_msg)
            # logger.info("msg_str: %s", msg_str)
            email_data = {
                "id": message_id,
                "threadId": message.get("threadId", None),
                "from": None,
                "to": None,
                "subject": None,
                "date": None,
                "text_content": None,
                "html_content": None,
            }

            # Getting email headers
            email_data["from"] = clean_whitespace(mime_msg.get("From"))
            email_data["to"] = clean_whitespace(mime_msg.get("To"))
            email_data["subject"] = clean_whitespace(mime_msg.get("Subject"))
            email_data["date"] = mime_msg.get("Date")

            # Extract body of the email
            if mime_msg.is_multipart():
                for part in mime_msg.walk():
                    content_type = part.get_content_type()
                    content_disposition = str(part.get("Content-Disposition"))
                    if (
                        content_type == "text/plain"
                        and "attachment" not in content_disposition
                    ):
                        email_data["text_content"] = part.get_payload(
                            decode=True
                        ).decode(encoding="utf-8", errors="ignore")
                    elif (
                        content_type == "text/html"
                        and "attachment" not in content_disposition
                    ):
                        email_data["html_content"] = part.get_payload(
                            decode=True
                        ).decode(encoding="utf-8", errors="ignore")
            else:
                content_type = mime_msg.get_content_type()
                if content_type == "text/plain":
                    email_data["text_content"] = mime_msg.get_payload(
                        decode=True
                    ).decode(encoding="utf-8", errors="ignore")
                elif content_type == "text/html":
                    email_data["html_content"] = mime_msg.get_payload(
                        decode=True
                    ).decode(encoding="utf-8", errors="ignore")

            email_data["raw_text_content"] = email_data["text_content"]
            email_data["text_content"] = get_email_content(email_data)

            return email_data

        except Exception as e:
            logger.exception(f"Error retrieving email with id {message_id}: {e}")
            return {}
    return {}

def get_email_ids(query: tuple = None, gmail_instance=None):
    email_ids = []
    page_token = None

    while True:
        response = (
            gmail_instance.users()
            .messages()
            .list(
                userId="me",
                q=query,
                includeSpamTrash=True,
                pageToken=page_token,
            )
            .execute()
        )

        if "messages" in response:
            email_ids.extend(response["messages"])

        page_token = response.get("nextPageToken")
        if not page_token:
            break

    return email_ids


def get_email_payload(msg):
    return msg.get("payload", None)


def get_email_headers(msg):
    email_data = get_email_payload(msg)
    if email_data:
        return email_data.get("headers", None)
    return None


def get_email_parts(msg):
    email_data = get_email_payload(msg)
    if email_data:
        return email_data.get("parts", None)
    return None


def get_email_subject_line(msg):
    try:
        email_headers = get_email_headers(msg)
        if email_headers:
            for header in email_headers:
                key = header.get("name")
                if key == "Subject":
                    return header.get("value", "")
    except Exception as e:
        logger.error("Error getting email subject line: %s", e)
    return ""


def get_last_capitalized_words_in_line(line):
    try:
        words = line.split()
        last_capitalized_words = []
        for word in reversed(words):
            if word[0].isupper():
                last_capitalized_words.append(word)
            else:
                break
        return " ".join(reversed(last_capitalized_words))
    except Exception as e:
        logger.error("Error getting last capitalized words in email subject: %s", e)
        return ""


def get_email_from_address(msg):
    try:
        email_headers = get_email_headers(msg)
        if email_headers:
            for header in email_headers:
                if header.get("name") == "From":
                    # if value enclosed in <> then extract email address
                    # else return the value as is
                    from_address = header.get("value")
                    if "<" in from_address:
                        return from_address.split("<")[1].split(">")[0]
                    return from_address
    except Exception as e:
        logger.error("Error getting email from address: %s", e)
    return ""


def get_received_at_timestamp(message_id, msg):
    import datetime

    try:
        email_headers = get_email_headers(msg)
        if email_headers:
            for header in email_headers:
                key = header.get("name")
                if key == "Date":
                    return header.get("value")
    except Exception as e:
        print("msg_%s: %s" % (message_id, e))
    return datetime.datetime.now()  # default if trouble parsing


def is_generic_email_domain(domain):
    # input expects return value of get_email_domain_from_address
    return domain in GENERIC_ATS_DOMAINS


def get_email_domain_from_address(email_address):
    return email_address.split("@")[1] if "@" in email_address else ""

def clean_email(email_body: str) -> list:
    import spacy
    from spacy_cleaner import processing, Cleaner

    try:
        model = spacy.load("en_core_web_sm")
        pipeline = Cleaner(
            model,
            processing.remove_stopword_token,
            processing.remove_punctuation_token,
            processing.remove_number_token,
        )
        return pipeline.clean([email_body])
    except Exception as e:
        logger.error("Error cleaning email: %s", e)
        return []


def get_word_frequency(cleaned_email):
    try:
        word_dict = {}
        for word in cleaned_email[0].split(" "):
            if word not in word_dict:
                word_dict[word] = 1
            else:
                word_dict[word] += 1

        word_dict_sorted = sorted(
            word_dict.items(), key=lambda item: item[1], reverse=True
        )
        return word_dict_sorted
    except Exception as e:
        logger.error("Error getting word frequency: %s", e)
        return []


def get_top_word_in_email_body(msg_id, msg):
    try:
        parts = get_email_parts(msg)
        if parts:
            for part in parts:
                if part.get("mimeType") not in [
                    "text/plain",
                    "text/html",
                ]:
                    continue
                if part.get("mimeType") and part.get("mimeType") in [
                    "text/plain",
                    "text/html",
                ]:
                    data = base64.urlsafe_b64decode(
                        part.get("body", {}).get("data", {})
                    ).decode("utf-8")
                    # Parse the content with BeautifulSoup
                    soup = BeautifulSoup(data, "html.parser")
                    # Extract the plain text from the HTML content
                    email_text = soup.get_text()
                    cleaned_text = clean_email(email_text)

                    if cleaned_text:
                        word_frequency = get_word_frequency(cleaned_text)
                        top_capitalized_word = get_top_consecutive_capitalized_words(
                            word_frequency
                        )
                        if not top_capitalized_word:
                            if len(cleaned_text) > 0:
                                try:
                                    return cleaned_text[0][0]
                                except IndexError:
                                    return cleaned_text[0]
                        return top_capitalized_word
    except Exception as e:
        logger.error("Error getting top word: %s", e)
    return ""


def get_company_name(id, msg, subject_line):
    try:
        top_word = get_top_word_in_email_body(id, msg)
        from_address = get_email_from_address(msg)
        domain = get_email_domain_from_address(from_address)
        if not top_word or top_word[0].islower():
            # no top word, or top word is not capitalized
            if is_generic_email_domain(domain):
                # if generic ATS domain like workday, greenhouse, etc.,
                # check the last capitalized word(s) in the subject line
                return get_last_capitalized_words_in_line(subject_line) or ""
            return domain.split(".")[0]
        return top_word
    except Exception as e:
        logger.error("Error getting company name: %s", e)
        return ""


def get_top_consecutive_capitalized_words(tuples_list):
    """
    Helper function to parse company name from an email.
    We only want the top capitalized words that appear consecutively and with the same frequency.
    """
    try:
        result = []
        temp_group = []
        max = float("-inf")
        for i, (first, second) in enumerate(tuples_list):
            is_capitalized = first and first[0].isupper()

            if is_capitalized:
                if not temp_group:
                    max = second
                    temp_group.append((first, second))
                if temp_group and temp_group[-1][1] == second:
                    # Add to the current group if criteria match
                    temp_group.append((first, second))
                if second < max:
                    break
                result.append(first)
        return " ".join(result)
    except Exception as e:
        logger.error("Error getting top consecutive capitalized words: %s", e)
        return ""

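Company-name extraction falls through three sources: the highest-frequency consecutive capitalized words in the body, then the sender's domain, and finally the subject line when the domain belongs to a generic ATS. A sketch of the individual fallbacks on hypothetical inputs:

# Sketch only: the fallbacks used by get_company_name, on made-up inputs.
assert get_email_domain_from_address("[email protected]") == "acme.com"
assert is_generic_email_domain("hire.lever.co") is True
assert get_last_capitalized_words_in_line("Interview with CoolCompany") == "CoolCompany"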
---
target_repo/backend/utils/file_utils.py
---
def get_user_filepath(user_id: str) -> str:
    """
    Each user has their own directory to store their data.
    """
    return f"users/{user_id}"

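get_user_filepath is pure string interpolation with no sanitization of user_id, so a crafted ID can point outside the intended users/ directory; whether that is exploitable depends on the callers, which are not shown in this dump. A sketch:

# Sketch only: an unsanitized user_id reaches the returned path unchanged.
assert get_user_filepath("123") == "users/123"
assert get_user_filepath("../../etc/passwd") == "users/../../etc/passwd"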
---
target_repo/backend/utils/filter_utils.py
---
import yaml


def parse_simple(term: str, field: str, exclude: bool = False) -> str:
    """
    Parses a simple combination of search field and search term into a gmail search string.
    If exclude is true, a "-" character is prepended to the field.

    Args:
        term (str): the term to parse
        field (str): field to search
        exclude (bool): whether to exclude the term
    """
    if field == "body":
        field_str = ""
    else:
        field_str = f"{field}:"

    if exclude:
        out_str = f'-{field_str}"{term}"'
    else:
        out_str = f'{field_str}"{term}"'

    return out_str


def parse_wildcard(term: str, field: str, exclude: bool = False) -> str:
    """
    The wildcard * is convenient to use in a yaml file, but it is
    not supported by the Gmail API. This function will parse
    any number of wildcards as ({field}:"{term1}" AND {field}:"{term2}" AND ...)

    If exclude is true, a "-" character is prepended to the field.

    Args:
        term (str): the term to parse
        field (str): field to search
        exclude (bool): whether to exclude the term
    """
    if field == "body":
        field_str = ""
    else:
        field_str = f"{field}:"

    if exclude:
        sub_terms = term.split(" * ")
        out_str = "(" + " AND ".join([f'-{field_str}"{x}"' for x in sub_terms]) + ")"

    else:
        sub_terms = term.split(" * ")
        out_str = "(" + " AND ".join([f'{field_str}"{x}"' for x in sub_terms]) + ")"

    return out_str


1890def parse_base_filter_config(filter_path: str) -> str:
1891 with open(filter_path, "r") as fid:
1892 data = yaml.safe_load(fid)
1893
1894 filter_str = ""
1895 for block in data:
1896 sub_filter_str = ""
1897  # Map the block's logic onto a boolean operator. An unknown value raises
1898  # KeyError here rather than leaving `operator` unbound further down.
1899  operator = {"any": " OR ", "all": " AND "}[block["logic"]]
1901
1902 # parse each item based on schema logic
1903 simple_filters = []
1904 wildcard_any_filters = []
1905 if block["how"] == "include":
1906 simple_filters += [
1907 parse_simple(x, block["field"], exclude=False)
1908 for x in block["terms"]
1909 if "*" not in x
1910 ]
1911 wildcard_any_filters += [
1912 parse_wildcard(x, block["field"], exclude=False)
1913 for x in block["terms"]
1914 if "*" in x
1915 ]
1916 if block["how"] == "exclude":
1917 simple_filters += [
1918 parse_simple(x, block["field"], exclude=True) for x in block["terms"]
1919 ]
1920
1921 # join with appropriate operator
1922 if simple_filters + wildcard_any_filters:
1923 sub_filter_str = operator.join(simple_filters + wildcard_any_filters)
1924
1925  # if this isn't the first item then we need to add an extra operator in front
1926 if sub_filter_str:
1927 if len(filter_str) > 0:
1928 sub_filter_str = operator + sub_filter_str
1929 filter_str += sub_filter_str
1930
1931 filter_str = "(" + filter_str + ")"
1932
1933 return filter_str
1934
1935
1936def parse_override_filter_config(filter_path: str):
1937 """Parses an override filter config into a Gmail search string (not yet wired into the app)."""
1938 with open(filter_path, "r") as fid:
1939 data = yaml.safe_load(fid)
1940
1941 filter_str_list = []
1942 for block in data:
1943 simple_filters = []
1944 for sub_block in block:
1945 include_terms = sub_block["include_terms"]
1946 exclude_terms = sub_block["exclude_terms"]
1947
1948 # parse each item based on schema logic
1949 if include_terms is not None:
1950 simple_filters += [
1951 parse_simple(x, sub_block["field"], exclude=False)
1952 for x in sub_block["include_terms"]
1953 ]
1954 if exclude_terms is not None:
1955 simple_filters += [
1956 parse_simple(x, sub_block["field"], exclude=True)
1957 for x in sub_block["exclude_terms"]
1958 ]
1959
1960 # join with an AND operator
1961 if simple_filters:
1962 filter_str_list.append("(" + " AND ".join(simple_filters) + ")")
1963
1964 filter_str = "(" + " OR ".join(filter_str_list) + ")"
1965
1966 return filter_str
1967
1968
1969---
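To make the filter schema concrete, here is a hedged sketch of a config in the shape `parse_base_filter_config` expects, with hypothetical terms; run through `parse_simple`/`parse_wildcard` it composes into a single Gmail query string:

```python
import yaml

# A hypothetical base filter config: each block carries "logic" ("any"/"all"),
# "how" ("include"/"exclude"), "field", and "terms"; " * " inside a term
# acts as a wildcard that expands to AND-ed sub-terms.
config = """
- logic: any
  how: include
  field: subject
  terms:
    - thank you for applying
    - application * received
- logic: all
  how: exclude
  field: from
  terms:
    - newsletter
"""

blocks = yaml.safe_load(config)
print(blocks[0]["terms"])
# Fed through the parsers above, these blocks would combine into a query
# along the lines of:
#   (subject:"thank you for applying" OR (subject:"application" AND
#    subject:"received") AND -from:"newsletter")
```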
1970target_repo/backend/utils/llm_utils.py
1971---
1972import google.generativeai as genai
1973import time
1974import json
1975from google.generativeai.types import GenerateContentResponse
1976import logging
1977
1978from utils.config_utils import get_settings
1979
1980settings = get_settings()
1981
1982# Configure Google Gemini API
1983genai.configure(api_key=settings.GOOGLE_API_KEY)
1984model = genai.GenerativeModel("gemini-2.0-flash-lite")
1985logger = logging.getLogger(__name__)
1986logging.basicConfig(
1987 level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
1988)
1989
1993
1994def process_email(email_text):
1995 prompt = f"""
1996 Extract the company name, job application status, and job title (role) from the following email.
1997
1998 Given the content of an email related to job applications or recruitment, assign one of the following labels to job application status based on the main purpose or outcome of the message:
1999
2000 Application confirmation
2001 Rejection
2002 Availability request
2003 Information request
2004 Assessment sent
2005 Interview invitation
2006 Did not apply - inbound request
2007 Action required from company
2008 Hiring freeze notification
2009 Withdrew application
2010 Offer made
2011 False positive, not related to job search
2012 Informational outreach
2013
2014 Labeling Rules and Explanations for Job Application Status:
2015
2016 Application confirmation
2017 Assign this label if the email confirms receipt of a job application.
2018 Examples: "We have received your application", "Thank you for applying", "Your application has been submitted".
2019
2020 Rejection
2021 Use this label for emails explicitly stating that the candidate is not moving forward in the process.
2022 Examples: "We regret to inform you...", "We will not be proceeding with your application", "You have not been selected".
2023
2024 Availability request
2025 Assign this label if the company asks for your availability for a call, interview, or meeting.
2026 Examples: "Please let us know your availability", "When are you free for a call?", "Can you share your available times?"
2027
2028 Information request
2029 Use this label if the company requests additional information, documents, or clarification.
2030 Examples: "Please send your portfolio", "Can you provide references?", "We need more information about..."
2031
2032 Assessment sent
2033 Assign this label if the company sends a test, assignment, or assessment for you to complete as part of the hiring process.
2034 Examples: "Please complete the attached assessment", "Here is your coding challenge", "Take-home assignment enclosed".
2035
2036 Interview invitation
2037 Use this label if the company invites you to an interview (phone, video, or onsite).
2038 Examples: "We would like to invite you to interview", "Interview scheduled", "Please join us for an interview".
2039
2040 Did not apply - inbound request
2041 Assign this label if the company or recruiter reaches out to you first, and you did not apply for the position.
2042 Examples: "We found your profile and would like to connect", "Are you interested in this opportunity?", "We came across your resume".
2043
2044 Action required from company
2045 Use this label if the next step is pending from the company, and you are waiting for their response or action.
2046 Examples: "We will get back to you", "Awaiting feedback from the team", "We will contact you with next steps".
2047
2048 Hiring freeze notification
2049 Assign this label if the company notifies you that the position is on hold or canceled due to a hiring freeze.
2050 Examples: "Position is on hold", "Hiring freeze in effect", "We are pausing recruitment".
2051
2052 Withdrew application
2053 Use this label if you (the candidate) have withdrawn your application, or the email confirms your withdrawal.
2054 Examples: "You have withdrawn your application", "Thank you for letting us know you are no longer interested".
2055
2056 Offer made
2057 Assign this label if the company extends a job offer to you.
2058 Examples: "We are pleased to offer you the position", "Offer letter attached", "Congratulations, you have been selected".
2059
2060 False positive, not related to job search
2061 Use this label if the email is not related to job applications, recruitment, or hiring.
2062 Examples: Newsletters, spam, unrelated notifications, or personal emails.
2063
2064 Informational outreach
2065 Assign this label if the company or recruiter is reaching out to share information, updates, or opportunities, but not in direct response to an application or as an explicit invitation to apply.
2066 Examples: "We wanted to let you know about upcoming roles", "Here’s information about our company", "General outreach about our hiring process".
2067
2068 Provide the output in JSON format, for example: "company_name": "company_name", "job_application_status": "status", "job_title": "job_title"
2069 Remove backticks. Only use double quotes. Enclose key and value pairs in a single pair of curly braces.
2070 Email: {email_text}
2071 """
2072
2073 retries = 3 # Max retries
2074 delay = 60 # Initial delay
2075 for attempt in range(retries):
2076 try:
2077 logger.info("Calling generate_content")
2078   response: GenerateContentResponse = model.generate_content(prompt)
2079 response.resolve()
2080 response_json: str = response.text
2081 logger.info("Received response from model: %s", response_json)
2082 if response_json:
2083 cleaned_response_json = (
2084 response_json.replace("json", "")
2085 .replace("`", "")
2086 .replace("'", '"')
2087 .strip()
2088 )
2095 logger.info("Cleaned response: %s", cleaned_response_json)
2096 return json.loads(cleaned_response_json)
2097 else:
2098 logger.error("Empty response received from the model.")
2099 return None
2100 except Exception as e:
2101 if "429" in str(e):
2102 logger.warning(
2103 f"Rate limit hit. Retrying in {delay} seconds (attempt {attempt + 1})."
2104 )
2105 time.sleep(delay)
2106 else:
2107 logger.error(f"process_email exception: {e}")
2108 return None
2109 logger.error(f"Failed to process email after {retries} attempts.")
2110 return None
2111
2112
2113
2114---
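A minimal harness for `process_email`, assuming the module has been imported with a valid `GOOGLE_API_KEY`; the sample email text is illustrative, not a fixture from the repo:

```python
sample = (
    "Hi Jane, thank you for applying to Acme Robotics. "
    "We have received your application for the Software Engineer role."
)

result = process_email(sample)
if result is not None:
    # Expected shape, per the prompt above:
    # {"company_name": ..., "job_application_status": ..., "job_title": ...}
    print(result.get("company_name"), result.get("job_application_status"))
```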
2115target_repo/backend/db/companies.py
2116---
2117from sqlmodel import SQLModel, Field, UniqueConstraint
2118
2119
2120class Companies(SQLModel, table=True):
2121 __tablename__ = "companies"
2122 company_id: int = Field(default=None, primary_key=True)
2123 company_name: str
2124 company_email_domain: str
2125
2126 __table_args__ = (
2127 # Ensure that company_name and company_email_domain together are unique
2128 UniqueConstraint(
2129 "company_name",
2130 "company_email_domain",
2131 name="unique_company_name_and_domain",
2132 ),
2133 )
2134
2135
2136---
2137target_repo/backend/db/company_jobs.py
2138---
2139from sqlmodel import SQLModel, Field, UniqueConstraint
2140from datetime import datetime
2141
2142
2143class CompanyJobs(SQLModel, table=True):
2144 __tablename__ = "company_jobs"
2145 company_job_id: int = Field(default=None, primary_key=True)
2146 company_id: int = Field(foreign_key="companies.company_id", nullable=False)
2147 company_job_title_id: int | None = Field(
2148 default=None, foreign_key="job_titles.job_title_id", nullable=True
2149 )
2150 company_job_description: str | None = Field(default=None, nullable=True)
2151 company_job_posted_at: datetime = Field(
2152 default_factory=datetime.utcnow, nullable=False
2153 )
2154 company_job_location: str | None = Field(default=None, nullable=True)
2155
2156 __table_args__ = (
2157  # Ensure that a company cannot post the same job twice
2158  UniqueConstraint(
2159   "company_id",
2160   "company_job_title_id",
2161   "company_job_location",
2162   "company_job_posted_at",
2163   name="unique_job",
2164  ),
2165 )
2166
2167
2168---
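An illustrative sketch of how these two models are meant to be used together, assuming `engine` is the module-level engine from backend/database.py and the models above are importable; values are hypothetical:

```python
from datetime import datetime, timezone
from sqlmodel import Session

with Session(engine) as session:
    company = Companies(company_name="Acme", company_email_domain="acme.com")
    session.add(company)
    session.commit()
    session.refresh(company)  # populate the generated company_id

    job = CompanyJobs(
        company_id=company.company_id,
        company_job_posted_at=datetime.now(timezone.utc),
        company_job_location="Remote",
    )
    session.add(job)
    # a duplicate (company_id, title_id, location, posted_at) combination
    # would violate the unique_job constraint defined above
    session.commit()
```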
2169target_repo/backend/db/job_status.py
2170---
2171from sqlmodel import SQLModel, Field
2172
2173
2174class JobStatus(SQLModel, table=True):
2175 __tablename__ = "job_statuses"
2176 status_id: int = Field(default=None, primary_key=True)
2177 status_name: str
2178 status_description: str
2179
2180
2181---
2182target_repo/backend/db/job_titles.py
2183---
2184from sqlmodel import SQLModel, Field, UniqueConstraint
2185
2186
2187class JobTitles(SQLModel, table=True):
2188 __tablename__ = "job_titles"
2189 job_title_id: int = Field(default=None, primary_key=True)
2190 job_title: str
2191
2192 __table_args__ = (UniqueConstraint("job_title", name="unique_job_title"),)
2193
2194
2195---
2196target_repo/backend/db/processing_tasks.py
2197---
2198from sqlmodel import Field, SQLModel, Relationship
2199from datetime import datetime, timezone
2200import sqlalchemy as sa
2201from db.users import Users
2202
2203FINISHED = "finished"
2204STARTED = "started"
2205
2206
2207class TaskRuns(SQLModel, table=True):
2208 __tablename__ = "processing_task_runs"
2209 user_id: str = Field(foreign_key="users.user_id", primary_key=True)
2210 created: datetime = Field(default_factory=lambda: datetime.now(timezone.utc), nullable=False)
2211 updated: datetime = Field(
2212 sa_column_kwargs={"onupdate": sa.func.now()},
2213 default_factory=lambda: datetime.now(timezone.utc),
2214 nullable=False,
2215 )
2216 status: str = Field(nullable=False)
2217 total_emails: int = 0
2218 processed_emails: int = 0
2219
2220 user: Users = Relationship()
2221
2222
2223---
2224target_repo/backend/db/user_emails.py
2225---
2226from sqlmodel import SQLModel, Field
2227from datetime import datetime
2228
2229class UserEmails(SQLModel, table=True):
2230 __tablename__ = "user_emails"
2231 id: str = Field(primary_key=True) # Gmail email ID (not unique globally)
2232 user_id: str = Field(primary_key=True) # Unique per user (composite key)
2233 company_name: str
2234 application_status: str
2235 received_at: datetime
2236 subject: str
2237 job_title: str
2238 email_from: str # to avoid 'from' being a reserved key word
2239
2240---
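Because `(id, user_id)` form a composite primary key, the same Gmail message id may legitimately exist for two different users, and lookups must supply both parts (in the order the key columns are declared). A sketch with illustrative key values, assuming `engine` comes from backend/database.py:

```python
from sqlmodel import Session

with Session(engine) as session:
    record = session.get(UserEmails, ("18c2f0a9d3e4b5f6", "user-123"))
    if record is not None:
        print(record.company_name, record.application_status)
```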
2241target_repo/backend/db/user_job_status.py
2242---
2243from sqlmodel import SQLModel, Field
2244
2245
2246class UserJobStatuses(SQLModel, table=True):
2247 __tablename__ = "user_job_statuses"
2248 user_job_status_id: int = Field(default=None, primary_key=True)
2249 user_id: str = Field(foreign_key="users.user_id", nullable=False)
2250 job_id: int = Field(foreign_key="company_jobs.company_job_id", nullable=False)
2251 status_id: int = Field(foreign_key="job_statuses.status_id", nullable=False)
2252
2253
2254---
2255target_repo/backend/db/user_jobs.py
2256---
2257from sqlmodel import SQLModel, Field
2258from datetime import datetime
2259
2260
2261class UserJobs(SQLModel, table=True):
2262 __tablename__ = "user_jobs"
2263 user_job_id: int = Field(primary_key=True, nullable=False)
2264 user_id: str = Field(foreign_key="users.user_id", nullable=False)
2265 job_id: int = Field(foreign_key="company_jobs.company_job_id", nullable=False)
2266 applied_at: datetime
2267
2268
2269---
2270target_repo/backend/db/user_session.py
2271---
2272from sqlmodel import SQLModel, Field
2273from uuid import UUID, uuid4
2274from datetime import datetime, timezone
2275from typing import Optional
2276
2277class UserSession(SQLModel, table=True):
2278 __tablename__ = "user_session"
2279 id: UUID = Field(default_factory=uuid4, primary_key=True)
2280 user_id: str = Field(foreign_key="users.user_id")
2281 session_start: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
2282 session_end: Optional[datetime] = None
2283 user_agent: Optional[str] = None
2284
2285
2286---
2287target_repo/backend/db/users.py
2288---
2289from sqlmodel import SQLModel, Field
2290from pydantic import BaseModel
2291from datetime import datetime
2292
2293class UserData(BaseModel):
2294 user_id: str
2295 user_email: str
2296 start_date: datetime
2297
2298class Users(SQLModel, table=True):
2299 __tablename__ = "users"
2300 user_id: str = Field(default=None, primary_key=True)
2301 user_email: str = Field(nullable=False)
2302 start_date: datetime = Field(nullable=False) # Start date for job applications
2303
2304
2305---
2306target_repo/backend/db/utils/user_email_utils.py
2307---
2308from db.user_emails import UserEmails
2309from datetime import datetime, timezone
2310import email.utils
2311import logging
2312from database import engine
2313from sqlmodel import Session, select
2314
2315logger = logging.getLogger(__name__)
2316
2317def parse_email_date(date_str: str) -> datetime:
2318 """
2319 Converts an email date string into a Python datetime object
2320 """
2321 try:
2322  dt = email.utils.parsedate_to_datetime(date_str)
2323 except (TypeError, ValueError):
2324  # unparseable date string; default to the current UTC datetime
2325  dt = datetime.now(timezone.utc)
 return dt
2326
2327
2328def check_email_exists(user_id: str, email_id: str) -> bool:
2329 """
2330 Checks if an email with the given emailId and userId exists in the database.
2331 """
2332 with Session(engine) as session:
2333 statement = select(UserEmails).where(
2334 (UserEmails.user_id == user_id) & (UserEmails.id == email_id)
2335 )
2336 result = session.exec(statement).first()
2337 return result is not None
2338
2339
2340def create_user_email(user, message_data: dict) -> UserEmails:
2341 """
2342 Creates a UserEmail record instance from the provided data.
2343 """
2344 try:
2345 received_at_str = message_data["received_at"]
2346 received_at = parse_email_date(received_at_str) # parse_email_date function was created as different date formats were being pulled from the data
2347 if check_email_exists(user.user_id, message_data["id"]):
2348 logger.info(f"Email with ID {message_data['id']} already exists in the database.")
2349 return None
2350 return UserEmails(
2351 id=message_data["id"],
2352 user_id=user.user_id,
2353 company_name=message_data["company_name"],
2354 application_status=message_data["application_status"],
2355 received_at=received_at,
2356 subject=message_data["subject"],
2357 job_title=message_data["job_title"],
2358 email_from=message_data["from"]
2359 )
2360 except Exception as e:
2361 logger.error(f"Error creating UserEmail record: {e}")
2362 return None
2363
2364
2365---
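A short illustration of `parse_email_date` on the RFC 2822 dates Gmail returns, and its fallback for unparseable strings; the inputs are illustrative:

```python
print(parse_email_date("Thu, 20 Mar 2025 14:32:00 +0000"))
# -> 2025-03-20 14:32:00+00:00
print(parse_email_date("not a date"))  # -> current UTC datetime
```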
2366target_repo/backend/db/utils/user_utils.py
2367---
2368import logging
2369from typing import Optional, Tuple
2370from db.user_emails import UserEmails
2371from sqlmodel import Session, select, func
2372from db.users import Users
2373from datetime import datetime, timedelta, timezone
2374
2375logger = logging.getLogger(__name__)
2376
2377def get_last_email_date(user_id: str) -> Optional[datetime]:
2378 """
2379 Checks the date of the user's most recent email
2380 """
2381 from database import engine
2382
2383 with Session(engine) as session:
2384 row = session.exec(
2385 select(func.max(UserEmails.received_at))
2386 .where(UserEmails.user_id == user_id)
2387 ).one() # aggregates in SQL to a single row
2388 return row
2389
2390def user_exists(user) -> Tuple[bool, Optional[datetime]]:
2391 """
2392 Checks if the user is already in the database
2393 """
2394 from database import engine
2395
2396 with Session(engine) as session:
2397 existing_user = session.exec(select(Users).where(Users.user_id == user.user_id)).first()
2398 if not existing_user:
2399 return False, None
2400 else:
2401 last_fetched_date = get_last_email_date(user.user_id)
2402 return True, last_fetched_date
2403
2404def add_user(user, request, start_date=None) -> Users:
2405 """
2406 Writes user data to the users model and session storage
2407
2408 """
2409 from database import engine
2410 with Session(engine) as session:
2411 # Check if the user already exists in the database
2412 existing_user = session.exec(select(Users).where(Users.user_id == user.user_id)).first()
2413
2414 if not existing_user:
2415
2416   start_date = start_date or getattr(user, "start_date", None) or (datetime.now(timezone.utc) - timedelta(days=90))
2417
2418 if isinstance(start_date, datetime):
2419 start_date = start_date.strftime("%Y-%m-%d")
2420
2421 # add a new user record
2422 new_user = Users(
2423 user_id=user.user_id,
2424 user_email=user.user_email,
2425 start_date=start_date
2426 )
2427
2428 session.add(new_user)
2429 session.commit()
2430 session.refresh(new_user)
2431 logger.info(f"Created new user record for user_id: {user.user_id}")
2432
2433 # Write start date to session storage
2434 if isinstance(start_date, str):
2435 request.session["start_date"] = start_date # Already a string, no need to convert
2436 else:
2437 request.session["start_date"] = start_date.isoformat() # Convert only if it's a datetime object
2438
2439 return new_user
2440 else:
2441 logger.info(f"User {user.user_id} already exists in the database.")
2442 return existing_user
2443
2444---
2445target_repo/backend/alembic/env.py
2446---
2447from logging.config import fileConfig
2448
2449from sqlalchemy import engine_from_config
2450from sqlalchemy import pool
2451
2452from alembic import context
2453
2454# Import your SQLAlchemy models/metadata
2455import sys
2456import os
2457sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
2458
2459from sqlmodel import SQLModel
2460
2461# This is the Alembic Config object
2462config = context.config
2463
2464# Interpret the config file for Python logging
2465if config.config_file_name is not None:
2466 fileConfig(config.config_file_name)
2467
2468# Set target metadata to SQLModel's metadata since that's likely what your models use
2469target_metadata = SQLModel.metadata
2470
2471# other values from the config, defined by the needs of env.py,
2472# can be acquired:
2473# my_important_option = config.get_main_option("my_important_option")
2474# ... etc.
2475
2476
2477def run_migrations_offline() -> None:
2478 """Run migrations in 'offline' mode."""
2479 url = config.get_main_option("sqlalchemy.url")
2480 context.configure(
2481 url=url,
2482 target_metadata=target_metadata,
2483 literal_binds=True,
2484 dialect_opts={"paramstyle": "named"},
2485 )
2486
2487 with context.begin_transaction():
2488 context.run_migrations()
2489
2490
2491def run_migrations_online() -> None:
2492 """Run migrations in 'online' mode."""
2493 connectable = engine_from_config(
2494 config.get_section(config.config_ini_section),
2495 prefix="sqlalchemy.",
2496 poolclass=pool.NullPool,
2497 )
2498
2499 with connectable.connect() as connection:
2500 context.configure(
2501 connection=connection, target_metadata=target_metadata
2502 )
2503
2504 with context.begin_transaction():
2505 context.run_migrations()
2506
2507
2508if context.is_offline_mode():
2509 run_migrations_offline()
2510else:
2511 run_migrations_online()
2512
2513
2514---
2515target_repo/backend/alembic/versions/6240656d52f6_add_job_title_column.py
2516---
2517"""add_job_title_column
2518
2519Revision ID: 6240656d52f6
2520Revises: b240c664ed46
2521Create Date: 2025-03-16 21:31:17.486275
2522
2523"""
2524from typing import Sequence, Union
2525
2526from alembic import op
2527import sqlalchemy as sa
2528
2529
2530# revision identifiers, used by Alembic.
2531revision: str = '6240656d52f6'
2532down_revision: Union[str, None] = 'b240c664ed46'
2533branch_labels: Union[str, Sequence[str], None] = None
2534depends_on: Union[str, Sequence[str], None] = None
2535
2536
2537def upgrade() -> None:
2538 """Add job_title column to the relevant table."""
2539 op.add_column('user_email', sa.Column('job_title', sa.String(255), nullable=True))
2540
2541
2542
2543def downgrade() -> None:
2544 """Remove job_title column."""
2545 op.drop_column('user_email', 'job_title')
2546
2547---
2548target_repo/backend/alembic/versions/b240c664ed46_change_user_email_id_to_varchar.py
2549---
2550"""change_user_email_id_to_varchar
2551
2552Revision ID: b240c664ed46
2553Revises:
2554Create Date: 2025-03-16 02:58:30.325992
2555
2556"""
2557from typing import Sequence, Union
2558
2559from alembic import op
2560import sqlalchemy as sa
2561from sqlalchemy.dialects import postgresql
2562
2563# revision identifiers, used by Alembic.
2564revision: str = 'b240c664ed46'
2565down_revision: Union[str, None] = None
2566branch_labels: Union[str, Sequence[str], None] = None
2567depends_on: Union[str, Sequence[str], None] = None
2568
2569
2570def upgrade() -> None:
2571 """Change user_email.id column from integer to varchar and create composite primary key."""
2572 # First, drop any constraints that depend on the id column
2573 op.execute('ALTER TABLE user_email DROP CONSTRAINT IF EXISTS user_email_pkey')
2574
2575 # Change the column type
2576 op.alter_column('user_email', 'id',
2577 existing_type=sa.INTEGER(),
2578 type_=sa.VARCHAR(255),
2579 postgresql_using='id::varchar')
2580
2581 # Add composite primary key constraint
2582 op.execute('ALTER TABLE user_email ADD PRIMARY KEY (id, user_id)')
2583
2584
2585def downgrade() -> None:
2586 """Revert to integer id column with appropriate primary key."""
2587 # Drop the composite primary key
2588 op.execute('ALTER TABLE user_email DROP CONSTRAINT IF EXISTS user_email_pkey')
2589
2590 # Change id back to integer (with potential data loss warning if non-numeric ids exist)
2591 op.alter_column('user_email', 'id',
2592 existing_type=sa.VARCHAR(255),
2593 type_=sa.INTEGER(),
2594 postgresql_using='id::integer')
2595
2596 # Restore original primary key on id only
2597 op.execute('ALTER TABLE user_email ADD PRIMARY KEY (id)')
2598
2599
2600---
2601target_repo/backend/alembic/versions/c256d0279ea6_rename_user_email_table_to_plural.py
2602---
2603"""rename_user_email_table_to_plural
2604
2605Revision ID: c256d0279ea6
2606Revises: 6240656d52f6
2607Create Date: 2025-03-17 03:16:53.078420
2608
2609"""
2610from typing import Sequence, Union
2611
2612from alembic import op
2613import sqlalchemy as sa
2614
2615
2616# revision identifiers, used by Alembic.
2617revision: str = 'c256d0279ea6'
2618down_revision: Union[str, None] = '6240656d52f6'
2619branch_labels: Union[str, Sequence[str], None] = None
2620depends_on: Union[str, Sequence[str], None] = None
2621
2622
2623def upgrade() -> None:
2624 """Rename user_email table to user_emails."""
2625 op.rename_table('user_email', 'user_emails')
2626
2627
2628def downgrade() -> None:
2629 """Rename user_emails table back to user_email."""
2630 op.rename_table('user_emails', 'user_email')
2631
2632---
2633target_repo/backend/routes/auth_routes.py
2634---
2635import datetime
2636import logging
2637from fastapi import APIRouter, Depends, HTTPException, Request, BackgroundTasks
2638from fastapi.responses import RedirectResponse, HTMLResponse
2639from google_auth_oauthlib.flow import Flow
from google.auth.transport.requests import Request as GoogleAuthRequest
2640
2641from db.utils.user_utils import user_exists
2642from utils.auth_utils import AuthenticatedUser
2643from session.session_layer import create_random_session_string, validate_session
2644from utils.config_utils import get_settings
2645from utils.cookie_utils import set_conditional_cookie
2646from routes.email_routes import fetch_emails_to_db
2647from slowapi import Limiter
2648from slowapi.util import get_remote_address
2649
2650limiter = Limiter(key_func=get_remote_address)
2651
2652# Logger setup
2653logger = logging.getLogger(__name__)
2654
2655# Get settings
2656settings = get_settings()
2657
2658# FastAPI router for Google login
2659router = APIRouter()
2660
2661APP_URL = settings.APP_URL
2662
2663@router.get("/login")
2664@limiter.limit("10/minute")
2665async def login(request: Request, background_tasks: BackgroundTasks):
2666 """Handles Google OAuth2 login and authorization code exchange."""
2667 code = request.query_params.get("code")
2668 flow = Flow.from_client_secrets_file(
2669 settings.CLIENT_SECRETS_FILE,
2670 settings.GOOGLE_SCOPES,
2671 redirect_uri=settings.REDIRECT_URI,
2672 )
2673
2674 try:
2675 if not code:
2676 authorization_url, state = flow.authorization_url(prompt="consent")
2677 return RedirectResponse(url=authorization_url)
2678 logger.info("Authorization code received, exchanging for token...")
2679 try:
2680 flow.fetch_token(code=code)
2681 except Exception as e:
2682 logger.error("Failed to fetch token: %s", e)
2683 return RedirectResponse(
2684 url=f"{settings.APP_URL}/errors?message=permissions_error",
2685 status_code=303
2686 )
2687 try:
2688 creds = flow.credentials
2689 except Exception as e:
2690 logger.error("Failed to fetch credentials: %s", e)
2691 return RedirectResponse(
2692 url=f"{settings.APP_URL}/errors?message=credentials_error",
2693 status_code=303
2694 )
2695
2696  if not creds.valid:
2697   # refreshing requires google-auth's transport Request, not fastapi.Request
   creds.refresh(GoogleAuthRequest())
2698   return RedirectResponse("/login", status_code=303)
2699
2700 user = AuthenticatedUser(creds)
2701 session_id = request.session["session_id"] = create_random_session_string()
2702
2703 # Set session details
2704 try:
2705 token_expiry = creds.expiry.isoformat()
2706 except Exception as e:
2707 logger.error("Failed to parse token expiry: %s", e)
2708 token_expiry = (
2709 datetime.datetime.utcnow() + datetime.timedelta(hours=1)
2710 ).isoformat()
2711
2712 request.session["token_expiry"] = token_expiry
2713 request.session["user_id"] = user.user_id
2714 request.session["creds"] = creds.to_json()
2715 request.session["access_token"] = creds.token
2716
2717 # NOTE: change redirection once dashboard is completed
2718 exists, last_fetched_date = user_exists(user)
2719 if exists:
2720 logger.info("User already exists in the database.")
2721 response = RedirectResponse(
2722 url=f"{settings.APP_URL}/processing", status_code=303
2723 )
2724 background_tasks.add_task(fetch_emails_to_db, user, request, last_fetched_date, user_id=user.user_id)
2725 logger.info("Background task started for user_id: %s", user.user_id)
2726 else:
2727 request.session["is_new_user"] = True
2728 response = RedirectResponse(
2729 url=f"{settings.APP_URL}/dashboard", status_code=303
2730 )
2731   logger.info("User does not exist in the database")
2732
2733 response = set_conditional_cookie(
2734 key="Authorization", value=session_id, response=response
2735 )
2736
2737 return response
2738 except Exception as e:
2739 logger.error("Login error: %s", e)
2740 return HTMLResponse(content="An error occurred, sorry!", status_code=500)
2741
2742
2743@router.get("/logout")
2744async def logout(request: Request):
2745 logger.info("Logging out")
2746 request.session.clear()
2747 # delete cookies on the response that is actually returned; cookies set on
 # an injected response parameter are dropped when a new response is returned
2748 response = RedirectResponse(f"{APP_URL}", status_code=303)
2749 response.delete_cookie(key="__Secure-Authorization")
 response.delete_cookie(key="Authorization")
 return response
2750
2751
2752@router.get("/me")
2753async def getUser(request: Request, user_id: str = Depends(validate_session)):
2754 if not user_id:
2755 raise HTTPException(
2756 status_code=401, detail="No user id found in session"
2757 )
2758 return {"user_id": user_id}
2759
2760---
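A hedged sketch of the credential-refresh path used in /login. google-auth needs its own transport `Request` object to refresh (fastapi.Request will not work); `ensure_fresh` is a hypothetical helper name, not part of the repo:

```python
from google.auth.transport.requests import Request as GoogleAuthRequest
from google.oauth2.credentials import Credentials

def ensure_fresh(creds: Credentials) -> Credentials:
    # Refresh only when the token is invalid and a refresh token is present.
    if not creds.valid and creds.refresh_token:
        creds.refresh(GoogleAuthRequest())
    return creds
```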
2761target_repo/backend/routes/email_routes.py
2762---
2763import logging
2764from typing import List, Optional
2765from fastapi import APIRouter, Depends, Request, HTTPException, BackgroundTasks
2766from fastapi.responses import HTMLResponse, JSONResponse, RedirectResponse
2767from sqlmodel import Session, select, desc
2768from googleapiclient.discovery import build
2769from db.user_emails import UserEmails
2770from db import processing_tasks as task_models
2771from db.utils.user_email_utils import create_user_email
2772from utils.auth_utils import AuthenticatedUser
2773from utils.email_utils import get_email_ids, get_email
2774from utils.llm_utils import process_email
2775from utils.config_utils import get_settings
2776from session.session_layer import validate_session
2777import database
2778from google.oauth2.credentials import Credentials
2779import json
2780from start_date.storage import get_start_date_email_filter
2781from constants import QUERY_APPLIED_EMAIL_FILTER
2782from datetime import datetime, timedelta, timezone
2783from slowapi import Limiter
2784from slowapi.util import get_remote_address
2785
2786limiter = Limiter(key_func=get_remote_address)
2787
2788# Logger setup
2789logger = logging.getLogger(__name__)
2790
2791# Get settings
2792settings = get_settings()
2793APP_URL = settings.APP_URL
2794
2795SECONDS_BETWEEN_FETCHING_EMAILS = 1 * 60 * 60 # 1 hour
2796
2797# FastAPI router for email routes
2798router = APIRouter()
2799
2800@router.get("/processing", response_class=HTMLResponse)
2801async def processing(request: Request, db_session: database.DBSession, user_id: str = Depends(validate_session)):
2802 logging.info("user_id:%s processing", user_id)
2803 if not user_id:
2804 logger.info("user_id: not found, redirecting to login")
2805 return RedirectResponse("/logout", status_code=303)
2806
2807 process_task_run: task_models.TaskRuns = db_session.get(task_models.TaskRuns, user_id)
2808
2809 if process_task_run is None:
2810 raise HTTPException(
2811 status_code=404, detail="Processing has not started."
2812 )
2813
2814 if process_task_run.status == task_models.FINISHED:
2815 logger.info("user_id: %s processing complete", user_id)
2816 return JSONResponse(
2817 content={
2818 "message": "Processing complete",
2819 "processed_emails": process_task_run.processed_emails,
2820 "total_emails": process_task_run.total_emails,
2821 }
2822 )
2823 else:
2824  logger.info("user_id: %s processing not complete", user_id)
2825 return JSONResponse(
2826 content={
2827 "message": "Processing in progress",
2828 "processed_emails": process_task_run.processed_emails,
2829 "total_emails": process_task_run.total_emails,
2830 }
2831 )
2832
2833
2834@router.get("/get-emails", response_model=List[UserEmails])
2835@limiter.limit("5/minute")
2836def query_emails(request: Request, db_session: database.DBSession, user_id: str = Depends(validate_session)) -> List[UserEmails]:
2837 try:
2838 logger.info(f"Fetching emails for user_id: {user_id}")
2839
2840 # Query emails sorted by date (newest first)
2841 statement = select(UserEmails).where(UserEmails.user_id == user_id).order_by(desc(UserEmails.received_at))
2842 user_emails = db_session.exec(statement).all()
2843
2844 logger.info(f"Found {len(user_emails)} emails for user_id: {user_id}")
2845 return user_emails # Return empty list if no emails exist
2846
2847 except Exception as e:
2848 logger.error(f"Error fetching emails for user_id {user_id}: {e}")
2849 raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
2850
2851
2852@router.delete("/delete-email/{email_id}")
2853async def delete_email(request: Request, db_session: database.DBSession, email_id: str, user_id: str = Depends(validate_session)):
2854 """
2855 Delete an email record by its ID for the authenticated user.
2856 """
2857 try:
2858 # Query the email record to ensure it exists and belongs to the user
2859 email_record = db_session.exec(
2860 select(UserEmails).where(
2861 (UserEmails.id == email_id) & (UserEmails.user_id == user_id)
2862 )
2863 ).first()
2864
2865 if not email_record:
2866 logger.warning(f"Email with id {email_id} not found for user_id {user_id}")
2867 raise HTTPException(
2868 status_code=404, detail=f"Email with id {email_id} not found"
2869 )
2870
2871 # Delete the email record
2872 db_session.delete(email_record)
2873 db_session.flush()
2874
2875 logger.info(f"Email with id {email_id} deleted successfully for user_id {user_id}")
2876 return {"message": "Item deleted successfully"}
2877
2878 except Exception as e:
2879 logger.error(f"Error deleting email with id {email_id} for user_id {user_id}: {e}")
2880 raise HTTPException(
2881 status_code=500, detail=f"Failed to delete email: {str(e)}"
2882 )
2883
2884
2885@router.post("/fetch-emails")
2886@limiter.limit("5/minute")
2887async def start_fetch_emails(
2888 request: Request, background_tasks: BackgroundTasks, user_id: str = Depends(validate_session)
2889):
2890 """Starts the background task for fetching and processing emails."""
2891
2892 if not user_id:
2893 raise HTTPException(status_code=403, detail="Unauthorized")
2894 logger.info(f"user_id:{user_id} start_fetch_emails")
2895 # Retrieve stored credentials
2896 creds_json = request.session.get("creds")
2897 if not creds_json:
2898 logger.error(f"Missing credentials for user_id: {user_id}")
2899 return HTMLResponse(content="User not authenticated. Please log in again.", status_code=401)
2900
2901 try:
2902 # Convert JSON string back to Credentials object
2903 creds_dict = json.loads(creds_json)
2904 creds = Credentials.from_authorized_user_info(creds_dict) # Convert dict to Credentials
2905 user = AuthenticatedUser(creds)
2906
2907 logger.info(f"Starting email fetching process for user_id: {user_id}")
2908
2909 # Start email fetching in the background
2910 background_tasks.add_task(fetch_emails_to_db, user, request, user_id=user_id)
2911
2912 return JSONResponse(content={"message": "Email fetching started"}, status_code=200)
2913 except Exception as e:
2914 logger.error(f"Error reconstructing credentials: {e}")
2915 raise HTTPException(status_code=500, detail="Failed to authenticate user")
2916
2917
2918def fetch_emails_to_db(user: AuthenticatedUser, request: Request, last_updated: Optional[datetime] = None, *, user_id: str) -> None:
2919 logger.info(f"Fetching emails to db for user_id: {user_id}")
2920
2921 with Session(database.engine) as db_session:
2922 # we track starting and finishing fetching of emails for each user
2923 process_task_run = (
2924 db_session.query(task_models.TaskRuns).filter_by(user_id=user_id).one_or_none()
2925 )
2926 if process_task_run is None:
2927 # if this is the first time running the task for the user, create a record
2928 process_task_run = task_models.TaskRuns(user_id=user_id)
2929 db_session.add(process_task_run)
2930  elif datetime.now(timezone.utc) - process_task_run.updated.replace(
2931   tzinfo=timezone.utc  # stored timestamps are treated as naive UTC
2932  ) < timedelta(seconds=SECONDS_BETWEEN_FETCHING_EMAILS):
2933 # limit how frequently emails can be fetched by a specific user
2934 logger.warning(
2935 "Less than an hour since last fetch of emails for user",
2936 extra={"user_id": user_id},
2937 )
2938 return
2939
2940 # this is helpful if the user applies for a new job and wants to rerun the analysis during the same session
2941 process_task_run.processed_emails = 0
2942 process_task_run.total_emails = 0
2943 process_task_run.status = task_models.STARTED
2944
2945 db_session.commit() # sync with the database so calls in the future reflect the task is already started
2946
2947 start_date = request.session.get("start_date")
2948 logger.info(f"start_date: {start_date}")
2949 start_date_query = get_start_date_email_filter(start_date)
2950 is_new_user = request.session.get("is_new_user")
2951
2952 query = start_date_query
2953 # check for users last updated email
2954 if last_updated:
2955   # convert the datetime to a Unix timestamp in seconds
2956   additional_time = int(last_updated.timestamp())
2957   # append it to the query so we only fetch emails received after that moment;
2958   # for example, if the newest stored email was received at 2025-03-20 14:32 UTC,
2959   # this becomes "after:1742481120" and Gmail returns only newer messages
2960 if not start_date or not is_new_user:
2961 query = QUERY_APPLIED_EMAIL_FILTER
2962 query += f" after:{additional_time}"
2963
2964 logger.info(f"user_id:{user_id} Fetching emails after {last_updated.isoformat()}")
2965  else:
2966   logger.info(f"user_id:{user_id} Fetching all emails (no last fetched date; start date may apply)")
2970
2971 service = build("gmail", "v1", credentials=user.creds)
2972
2973 messages = get_email_ids(
2974 query=query, gmail_instance=service
2975 )
2976 # Update session to remove "new user" status
2977 request.session["is_new_user"] = False
2978
2979 if not messages:
2980 logger.info(f"user_id:{user_id} No job application emails found.")
2981 process_task_run = db_session.get(task_models.TaskRuns, user_id)
2982 process_task_run.status = task_models.FINISHED
2983 db_session.commit()
2984 return
2985
2986 logger.info(f"user_id:{user.user_id} Found {len(messages)} emails.")
2987 process_task_run.total_emails = len(messages)
2988 db_session.commit()
2989
2990 email_records = [] # list to collect email records
2991
2992 for idx, message in enumerate(messages):
2993 message_data = {}
2994 # (email_subject, email_from, email_domain, company_name, email_dt)
2995 msg_id = message["id"]
2996 logger.info(
2997 f"user_id:{user_id} begin processing for email {idx + 1} of {len(messages)} with id {msg_id}"
2998 )
2999 process_task_run.processed_emails = idx + 1
3000 db_session.commit()
3001
3002 msg = get_email(message_id=msg_id, gmail_instance=service)
3003
3004   if msg:
    result = None  # ensure `result` is defined even if process_email raises
3005    try:
3006 result = process_email(msg["text_content"])
3007 # if values are empty strings or null, set them to "unknown"
3008 for key in result.keys():
3009 if not result[key]:
3010 result[key] = "unknown"
3011 except Exception as e:
3012 logger.error(
3013 f"user_id:{user_id} Error processing email {idx + 1} of {len(messages)} with id {msg_id}: {e}"
3014 )
3015
3016 if not isinstance(result, str) and result:
3017 logger.info(
3018 f"user_id:{user_id} successfully extracted email {idx + 1} of {len(messages)} with id {msg_id}"
3019 )
3020     if result.get("job_application_status", "").lower() == "false positive, not related to job search":
3021 logger.info(
3022 f"user_id:{user_id} email {idx + 1} of {len(messages)} with id {msg_id} is a false positive, not related to job search"
3023 )
3024 continue # skip this email if it's a false positive
3025 else: # processing returned unknown which is also likely false positive
3026 logger.warning(
3027 f"user_id:{user_id} failed to extract email {idx + 1} of {len(messages)} with id {msg_id}"
3028 )
3029 result = {"company_name": "unknown", "application_status": "unknown", "job_title": "unknown"}
3030
3031 message_data = {
3032 "id": msg_id,
3033 "company_name": result.get("company_name", "unknown"),
3034 "application_status": result.get("job_application_status", "unknown"),
3035 "received_at": msg.get("date", "unknown"),
3036 "subject": msg.get("subject", "unknown"),
3037 "job_title": result.get("job_title", "unknown"),
3038 "from": msg.get("from", "unknown"),
3039 }
3040 email_record = create_user_email(user, message_data)
3041 if email_record:
3042 email_records.append(email_record)
3043
3044 # batch insert all records at once
3045 if email_records:
3046 db_session.add_all(email_records)
3047 logger.info(
3048 f"Added {len(email_records)} email records for user {user_id}"
3049 )
3050
3051 process_task_run.status = task_models.FINISHED
3052 db_session.commit()
3053
3054 logger.info(f"user_id:{user_id} Email fetching complete.")
3055
3056
3057---
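How the incremental fetch narrows the Gmail query: the `after:` operator accepts a Unix timestamp in seconds. A small runnable illustration; the base filter string here is a stand-in for QUERY_APPLIED_EMAIL_FILTER from constants.py:

```python
from datetime import datetime, timezone

last_updated = datetime(2025, 3, 20, 14, 32, tzinfo=timezone.utc)
query = 'subject:("application received")' + f" after:{int(last_updated.timestamp())}"
print(query)  # -> subject:("application received") after:1742481120
```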
3058target_repo/backend/routes/file_routes.py
3059---
3060import csv
3061import os
3062import logging
3063import plotly.graph_objects as go
3064from fastapi import APIRouter, HTTPException, Request, Depends
3065from fastapi.responses import FileResponse, RedirectResponse
3066from slowapi import Limiter
3067from slowapi.util import get_remote_address
3068import database
3069from utils.file_utils import get_user_filepath
3070from session.session_layer import validate_session
3071from routes.email_routes import query_emails
3072
3073
3074# Logger setup
3075logger = logging.getLogger(__name__)
3076
3077# FastAPI router for file routes
3078router = APIRouter()
3079limiter = Limiter(key_func=get_remote_address)
3080
3081@router.get("/download-file")
3082async def download_file(request: Request, user_id: str = Depends(validate_session)):
3083 if not user_id:
3084 return RedirectResponse("/logout", status_code=303)
3085 directory = get_user_filepath(user_id)
3086 filename = "emails.csv"
3087 filepath = f"{directory}/{filename}"
3088 if os.path.exists(filepath):
3089 logger.info("user_id:%s downloading from filepath %s", user_id, filepath)
3090 return FileResponse(filepath)
3091 raise HTTPException(status_code=400, detail="File not found")
3092
3093
3094@router.get("/write-to-csv")
3095async def write_to_csv(request: Request, db_session: database.DBSession, user_id: str = Depends(validate_session)):
3096 if not user_id:
3097 return RedirectResponse("/logout", status_code=303)
3098
3099 # Get job related email data from DB
3100 emails = query_emails(request, db_session=db_session, user_id=user_id)
3101 if not emails:
3102 raise HTTPException(status_code=400, detail="No data found to write")
3103
3104 directory = get_user_filepath(user_id)
3105 os.makedirs(directory, exist_ok=True) # Ensure the directory exists
3106
3107 filename = "emails.csv"
3108 filepath = os.path.join(directory, filename)
3109
3110 # Key: DB field name; Value: Human-readable field name
3111 field_mapping = {
3112 "company_name": "Company Name",
3113 "application_status": "Application Status",
3114 "received_at": "Received At",
3115 "subject": "Subject",
3116 "email_from": "Sender"
3117 }
3118
3119 selected_fields = list(field_mapping.keys())
3120 headers = list(field_mapping.values())
3121
3122 # Filter out unwanted fields
3123 processed_emails = [
3124 {key: value for key, value in email if key in selected_fields} for email in emails
3125 ]
3126
3127 # Write to CSV
3128 with open(filepath, mode="w", newline="") as file:
3129 writer = csv.writer(file)
3130 writer.writerow(headers)
3131 for row in processed_emails:
3132 writer.writerow([row[field] for field in selected_fields])
3133
3134 logger.info("CSV file created at %s", filepath)
3135 return {"message": f"CSV file written successfully at {filepath}"}
3136
3137
3138# Write and download csv
3139@router.get("/process-csv")
3140@limiter.limit("2/minute")
3141async def process_csv(request: Request, db_session: database.DBSession, user_id: str = Depends(validate_session)):
3142 if not user_id:
3143 return RedirectResponse("/logout", status_code=303)
3144
3145 directory = get_user_filepath(user_id)
3146 filename = "emails.csv"
3147 filepath = os.path.join(directory, filename)
3148
3149 # Get job related email data from DB
3150 emails = query_emails(request, db_session=db_session, user_id=user_id)
3151 if not emails:
3152 raise HTTPException(status_code=400, detail="No data found to write")
3153 # Ensure the directory exists
3154 os.makedirs(directory, exist_ok=True)
3155
3156 # Key: DB field name; Value: Human-readable field name
3157 field_mapping = {
3158 "company_name": "Company Name",
3159 "application_status": "Application Status",
3160 "received_at": "Received At",
3161 "job_title": "Job Title",
3162 "subject": "Subject",
3163 "email_from": "Sender"
3164 }
3165
3166 selected_fields = list(field_mapping.keys())
3167 headers = list(field_mapping.values())
3168
3169 # Filter out unwanted fields
3170 processed_emails = [
3171 {key: value for key, value in email if key in selected_fields} for email in emails
3172 ]
3173
3174 # Write to CSV
3175 with open(filepath, mode="w", newline="") as file:
3176 writer = csv.writer(file)
3177 writer.writerow(headers)
3178 for row in processed_emails:
3179 writer.writerow([row[field] for field in selected_fields])
3180
3181 logger.info("CSV file created at %s", filepath)
3182
3183 # Download CSV
3184 if os.path.exists(filepath):
3185 logger.info("user_id:%s downloading from filepath %s", user_id, filepath)
3186 return FileResponse(filepath)
3187
3188 # File not found error
3189 raise HTTPException(status_code=400, detail="File not found")
3190
3191
3192# Write and download sankey diagram
3193@router.get("/process-sankey")
3194@limiter.limit("2/minute")
3195async def process_sankey(request: Request, db_session: database.DBSession, user_id: str = Depends(validate_session)):
3196 # Validate user session, redirect if invalid
3197 if not user_id:
3198 return RedirectResponse("/logout", status_code=303)
3199
3200 num_applications = 0
3201 num_offers = 0
3202 num_rejected = 0
3203 num_request_for_availability = 0
3204 num_interview_scheduled = 0
3205 num_no_response = 0
3206
3207 # Get job related email data from DB
3208 emails = query_emails(request, db_session=db_session, user_id=user_id)
3209 if not emails:
3210 raise HTTPException(status_code=400, detail="No data found to write")
3211
3212 for email in emails:
3213 # normalize the output
3214 status = email.application_status.strip().lower()
3215 num_applications += 1
3216 if status == "offer":
3217 num_offers += 1
3218 elif status == "rejected":
3219 num_rejected += 1
3220 elif status == "request for availability":
3221 num_request_for_availability += 1
3222 elif status == "interview scheduled":
3223 num_interview_scheduled += 1
3224 elif status == "no response":
3225 num_no_response += 1
3226
3227 # Create the Sankey diagram
3228 fig = go.Figure(go.Sankey(
3229 node=dict(label=[f"Applications ({num_applications})",
3230 f"Offers ({num_offers})",
3231 f"Rejected ({num_rejected})",
3232 f"Request for Availability ({num_request_for_availability})",
3233 f"Interview Scheduled ({num_interview_scheduled})",
3234 f"No Response ({num_no_response})"]),
3235 link=dict(source=[0, 0, 0, 0, 0], target=[1, 2, 3, 4, 5],
3236 value=[num_offers, num_rejected, num_request_for_availability, num_interview_scheduled, num_no_response])))
3237
3238
3239 # Define the user's file path and ensure the directory exists
3240 directory = get_user_filepath(user_id)
3241 filename = "sankey_diagram.png"
3242 filepath = os.path.join(directory, filename)
3243
3244 # Ensure the directory exists
3245 os.makedirs(directory, exist_ok=True)
3246
3247 try:
3248 # Save the Sankey diagram as PNG
3249 fig.write_image(filepath) # Requires Kaleido for image export
3250 logger.info("user_id:%s Sankey diagram saved to %s", user_id, filepath)
3251
3252 # Return the file with correct headers and explicit filename
3253 return FileResponse(
3254 filepath,
3255 media_type="image/png", # Correct media type for PNG
3256 filename=filename,
3257 headers={"Content-Disposition": f"attachment; filename={filename}"} # Ensure correct filename in header
3258 )
3259 except Exception as e:
3260 logger.error("Error generating Sankey diagram for user_id:%s - %s", user_id, str(e))
3261 raise HTTPException(status_code=500, detail="Error generating Sankey diagram")
3262
3263
3264
3265---
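Why `for key, value in email` works in the CSV writers above: SQLModel instances are pydantic models, and iterating one yields `(field_name, value)` pairs. A simplified, self-contained stand-in for UserEmails:

```python
from sqlmodel import SQLModel

class Row(SQLModel):
    company_name: str = "Acme"
    subject: str = "Offer"

selected = {"company_name"}
print({key: value for key, value in Row() if key in selected})
# -> {'company_name': 'Acme'}
```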
3266target_repo/backend/routes/start_date_routes.py
3267---
3268import logging
3269from fastapi import APIRouter, Request, Form, Depends
3270from fastapi.responses import JSONResponse, HTMLResponse
3271from db.utils.user_utils import add_user
3272import json
3273from utils.auth_utils import AuthenticatedUser
3274from google.oauth2.credentials import Credentials
3275from session.session_layer import validate_session
3276from slowapi import Limiter
3277from slowapi.util import get_remote_address
3278
3279limiter = Limiter(key_func=get_remote_address)
3280
3281# Logger setup
3282logger = logging.getLogger(__name__)
3283
3284api_call_finished = False
3285
3286# FastAPI router for email routes
3287router = APIRouter()
3288
3289@router.post("/set-start-date")
3290@limiter.limit("1/minute")
3291async def set_start_date(request: Request, start_date: str = Form(...), user_id: str = Depends(validate_session)):
3292 """Updates the user's job search start date in the database."""
3293 user_id = request.session.get("user_id")
3294
3295 if not user_id:
3296 return HTMLResponse(content="Invalid request. Please log in again.", status_code=400)
3297
3298 # Retrieve stored credentials
3299 creds_json = request.session.get("creds")
3300 if not creds_json:
3301 logger.error(f"user_id:{user_id} missing credentials /set-start-date")
3302 return HTMLResponse(content="User not authenticated. Please log in again.", status_code=401)
3303
3304 try:
3305 # Convert JSON string back to Credentials object
3306 creds_dict = json.loads(creds_json)
3307 creds = Credentials.from_authorized_user_info(creds_dict) # Convert dict to Credentials
3308  user = AuthenticatedUser(creds, start_date)
3309
3310 # Save start date in DB
3311 add_user(user, request, start_date)
3312
3313 # Update session to remove "new user" status
3314 request.session["is_new_user"] = False
3315
3316 logger.info(f"user_id:{user_id} added start date {start_date}")
3317
3318 return JSONResponse(content={"message": "Start date updated successfully"}, status_code=200)
3319 except Exception as e:
3320 logger.error(f"Error reconstructing credentials: {e}")
3321 return HTMLResponse(content="Failed to save start date. Try again.", status_code=500)
3322
3323def get_start_date(request: Request, user_id: str = Depends(validate_session)) -> str:
3324 """Fetches the user's job search start date from the database."""
3325 # Query the database for the user's start date
3326 logger.info(f"Getting start date for user_id: {user_id}")
3327 return request.session.get("start_date")
3328
3329
3330@router.get("/api/session-data")
3331@limiter.limit("5/minute")
3332async def get_session_data(request: Request, user_id: str = Depends(validate_session)):
3333 """Fetches session data for the user."""
3334
3335 user_id = request.session.get("user_id")
3336 token_expiry = request.session.get("token_expiry")
3337 session_id = request.session.get("session_id")
3338 is_new_user = request.session.get("is_new_user", False)
3339
3340 logger.info(f"Fetching session data: user_id={user_id}, session_id={session_id}")
3341
3342 if not user_id:
3343 logger.warning("Session data missing user_id. Possible expired or invalid session.")
3344 return JSONResponse(content={"error": "Session expired or invalid"}, status_code=401)
3345
3346 session_data = {
3347 "user_id": user_id,
3348 "token_expiry": token_expiry,
3349 "session_id": session_id,
3350 "is_new_user": is_new_user,
3351 }
3352
3353 logger.info(f"Session data being returned: {session_data}")
3354
3355 return JSONResponse(content=session_data)
3356
3357---
3358target_repo/backend/routes/users_routes.py
3359---
3360import logging
3361from fastapi import APIRouter, Depends, Request, HTTPException
3362from sqlmodel import select
3363from db.user_emails import UserEmails
3364from utils.config_utils import get_settings
3365from session.session_layer import validate_session
3366from routes.email_routes import query_emails
3367import database
3368from slowapi import Limiter
3369from slowapi.util import get_remote_address
3370
3371
3372# Logger setup
3373logger = logging.getLogger(__name__)
3374
3375# Get settings
3376settings = get_settings()
3377APP_URL = settings.APP_URL
3378
3379api_call_finished = False
3380
3381# FastAPI router for email routes
3382router = APIRouter()
3383limiter = Limiter(key_func=get_remote_address)
3384
3385@router.get("/get-response-rate")
3386@limiter.limit("2/minute")
3387def response_rate_by_job_title(request: Request, db_session: database.DBSession, user_id: str = Depends(validate_session)):
3388
3389 try:
3390 # Get job related email data from DB
3391 user_emails = query_emails(request, db_session=db_session, user_id=user_id)
3392
3393 index = 0
3394
3395 # Tracks all job titles and their index in response_rate
3396 job_titles = {}
3397
3398 # Store (company, job_title) tuples to avoid duplicates
3399 companies = []
3400
3401 # List of dictionaries to store job titles and their response rates
3402 response_rate_data = []
3403
3404 for email in user_emails:
3405 if email.job_title not in job_titles:
3406   status = email.application_status.strip().lower()
3407   if status in ("request for availability", "offer", "interview scheduled"):
3408    response_rate_data.append({"title": email.job_title, "responses": 1, "total": 1})
3409   else:
3410    response_rate_data.append({"title": email.job_title, "responses": 0, "total": 1})
3411   companies.append((email.company_name, email.job_title))
3412   job_titles[email.job_title] = index
3413   index += 1
3414  elif (email.company_name, email.job_title) not in companies:
3415   status = email.application_status.strip().lower()
3416   if status in ("request for availability", "offer", "interview scheduled"):
3417    response_rate_data[job_titles[email.job_title]]["responses"] += 1
3418   # every new (company, title) pair counts toward the total; responses only on a positive status
   response_rate_data[job_titles[email.job_title]]["total"] += 1
3419   companies.append((email.company_name, email.job_title))
3420
3421 response_rate = []
3422 for data in response_rate_data:
3423 response_rate.append({
3424 "title": data["title"],
3425 "rate": round(data["responses"] / data["total"] * 100, 2)
3426 })
3427
3428 return response_rate
3429
3430 except Exception as e:
3431 logger.error(f"Error fetching job titles for user_id {user_id}: {e}")
3432 raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
3433
3434@router.get("/user-response-rate")
3435def calculate_response_rate(
3436 request: Request, db_session: database.DBSession, user_id: str = Depends(validate_session)
3437) -> dict:
3438 user_emails = db_session.exec(
3439 select(UserEmails).where(UserEmails.user_id == user_id)
3440 ).all()
3441
3442 # if the user has no applications, just return a 0.0 rate
3443 total_apps = len(user_emails)
3444 if total_apps == 0:
3445  return {"value": 0.0}
3446
3447 interview_requests = 0
3448 for email in user_emails:
3449  # using "request for availability" as an interview request since it should come before the offer and scheduled interview
3450 if (
3451 email.application_status
3452 and email.application_status.lower() == "request for availability"
3453 ):
3454 interview_requests += 1
3455
3456 response_rate_percent = (interview_requests / total_apps) * 100
3457 return {"value": round(response_rate_percent, 1)}
3458
3459
3460
3461---
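A tiny worked example of the per-title response rate computed above, with illustrative numbers: three distinct (company, title) pairs for one title, one of which drew an interview request, gives 1 response out of 3 applications:

```python
responses, total = 1, 3
print(round(responses / total * 100, 2))  # -> 33.33
```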
3462target_repo/backend/session/session_layer.py
3463---
3464# backend/session/session_layer.py
3465import logging
3466import secrets
3467from datetime import datetime
3468from fastapi import Request
3469from utils.config_utils import get_settings
3470
3471settings = get_settings()
3472
3473def create_random_session_string() -> str:
3474 return secrets.token_urlsafe(32) # Generates a random URL-safe string
3475
3476
3477def validate_session(request: Request) -> str:
3478 """Retrieves Authorization, session_id, access_token and token_expiry
3479 from request cookies and validates them.
3480 Session ID should match the stored session.
3481 Access token should not be expired.
3482 """
3483 if settings.is_publicly_deployed:
3484 session_authorization = request.cookies.get("__Secure-Authorization")
3485 else:
3486 session_authorization = request.cookies.get("Authorization")
3487
3488 session_id = request.session.get("session_id")
3489 session_access_token = request.session.get("access_token")
3490 token_exp = request.session.get("token_expiry")
3491 user_id = request.session.get("user_id")
3492
3493 if not session_authorization and not session_access_token:
3494 logging.info(
3495 "No Authorization and access_token in session, redirecting to login"
3496 )
3497 return ""
3498
3499 if session_authorization != session_id:
3500 logging.info("Authorization does not match Session Id, redirecting to login")
3501 return ""
3502
3503 if is_token_expired(token_exp):
3504 logging.info("Access_token is expired, redirecting to login")
3505 return ""
3506
3507 logging.info("Valid Session, Access granted.")
3508 return user_id
3509
3510
3511def is_token_expired(iso_expiry: str) -> bool:
3512 """
3513 Converts ISO format timestamp (which serves as the expiry time of the token) to datetime.
3514 If the current time is greater than the expiry time,
3515 the token is expired.
3516 """
3517 if iso_expiry:
3518 datetime_expiry = datetime.fromisoformat(iso_expiry) # UTC time
3519 difference_in_minutes = (
3520 datetime_expiry - datetime.utcnow()
3521 ).total_seconds() / 60
3522 return difference_in_minutes <= 0
3523
3524 return True
3525
3526
3527---
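A short illustration of `is_token_expired`, which compares a naive ISO timestamp against `utcnow()` and treats a missing expiry as expired; the values are illustrative and assume the module is importable:

```python
from datetime import datetime, timedelta

future = (datetime.utcnow() + timedelta(hours=1)).isoformat()
print(is_token_expired(future))  # -> False
print(is_token_expired(""))      # -> True (no expiry means expired)
```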