Bitsec (subnet 60)

Vulnerability History

Date         High Risk   Low Risk
2025-05-29   1           1

Audit Report Details

3527 Lines of Code
2 Open
0 Resolved
🚨 High Risk Vulnerabilities
⚠️ Low Risk Vulnerabilities

Vulnerable Code:

# Repo Tree (Python files only, excluding .gitignored files)

├── __init__.py
├── backend
│   ├── __init__.py
│   ├── alembic
│   │   ├── env.py
│   │   └── versions
│   │       ├── 6240656d52f6_add_job_title_column.py
│   │       ├── b240c664ed46_change_user_email_id_to_varchar.py
│   │       └── c256d0279ea6_rename_user_email_table_to_plural.py
│   ├── config.py
│   ├── constants.py
│   ├── database.py
│   ├── db
│   │   ├── companies.py
│   │   ├── company_jobs.py
│   │   ├── job_status.py
│   │   ├── job_titles.py
│   │   ├── processing_tasks.py
│   │   ├── user_emails.py
│   │   ├── user_job_status.py
│   │   ├── user_jobs.py
│   │   ├── user_session.py
│   │   ├── users.py
│   │   └── utils
│   │       ├── user_email_utils.py
│   │       └── user_utils.py
│   ├── email_query_filters
│   ├── main.py
│   ├── routes
│   │   ├── auth_routes.py
│   │   ├── email_routes.py
│   │   ├── file_routes.py
│   │   ├── start_date_routes.py
│   │   └── users_routes.py
│   ├── session
│   │   └── session_layer.py
│   ├── start_date
│   │   └── storage.py
│   ├── static
│   ├── templates
│   ├── tests
│   │   ├── __init__.py
│   │   ├── conftest.py
│   │   ├── routes
│   │   │   ├── __init__.py
│   │   │   ├── conftest.py
│   │   │   └── test_email_routes.py
│   │   ├── test_config_utils.py
│   │   ├── test_constants.py
│   │   ├── test_email_utils.py
│   │   ├── test_filter_schema.py
│   │   └── test_filter_utils.py
│   └── utils
│       ├── auth_utils.py
│       ├── config_utils.py
│       ├── cookie_utils.py
│       ├── email_utils.py
│       ├── file_utils.py
│       ├── filter_utils.py
│       └── llm_utils.py
├── docs
│   └── use_cases
├── frontend
│   ├── app
│   │   ├── api
│   │   │   └── subscribe
│   │   ├── dashboard
│   │   ├── errors
│   │   ├── logout
│   │   ├── preview
│   │   │   ├── dashboard
│   │   │   └── processing
│   │   ├── processing
│   ├── components
│   ├── config
│   ├── public
│   ├── styles
│   ├── tests
│   ├── types
│   └── utils


# Complete repo contents (files-to-prompt output)

target_repo/__init__.py
---


---
target_repo/backend/__init__.py
---


---
target_repo/backend/config.py
---
import json

from pydantic import field_validator
from pydantic_settings import BaseSettings, SettingsConfigDict, NoDecode
from typing import List
from typing_extensions import Annotated
import logging

logger = logging.getLogger(__name__)


class Settings(BaseSettings):
    GOOGLE_SCOPES: Annotated[List[str], NoDecode]
    REDIRECT_URI: str
    GOOGLE_CLIENT_ID: str
    GOOGLE_API_KEY: str
    COOKIE_SECRET: str
    CLIENT_SECRETS_FILE: str = "credentials.json"
    ENV: str = "dev"
    APP_URL: str
    ORIGIN: str = ".jobba.help"
    DATABASE_URL: str = "default-for-local"
    DATABASE_URL_LOCAL_VIRTUAL_ENV: str = (
        "postgresql://postgres:postgres@localhost:5433/jobseeker_analytics"
    )
    DATABASE_URL_DOCKER: str = (
        "postgresql://postgres:postgres@db:5432/jobseeker_analytics"
    )

    @field_validator("GOOGLE_SCOPES", mode="before")
    @classmethod
    def decode_scopes(cls, v: str) -> List[str]:
        logger.info("Decoded scopes from string: %s", json.loads(v.strip("'\"")))
        return json.loads(v.strip("'\""))

    @property
    def is_publicly_deployed(self) -> bool:
        return self.ENV in ["prod", "staging"]

    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="allow")


settings = Settings(_env_file=".env", _env_file_encoding="utf-8")

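Because GOOGLE_SCOPES is annotated NoDecode, pydantic-settings hands the raw env string to decode_scopes, which strips wrapping quotes and JSON-decodes it (and logs the decoded scopes at INFO). A minimal sketch of that round trip, mirroring the repo's own tests in backend/tests/test_config_utils.py; the scope values are illustrative:

# Sketch only (not part of the audited file): decode_scopes is a classmethod,
# so it can be exercised directly, exactly as the repo's tests do.
raw = '\'["https://www.googleapis.com/auth/gmail.readonly", "openid"]\''
assert Settings.decode_scopes(raw) == [
    "https://www.googleapis.com/auth/gmail.readonly",
    "openid",
]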
---
target_repo/backend/constants.py
---
"""
This file contains the main constants used in the application.
"""

from datetime import datetime, timedelta
from pathlib import Path
from utils.filter_utils import (
    parse_base_filter_config,
)  # , parse_override_filter_config


GENERIC_ATS_DOMAINS = [
    "us.greenhouse-mail.io",
    "smartrecruiters.com",
    "linkedin.com",
    "ashbyhq.com",
    "hire.lever.co",
    "hi.wellfound.com",
    "talent.icims.com",
    "myworkday.com",
    "otta.com",
]

DEFAULT_DAYS_AGO = 30
# Get the current date
current_date = datetime.now()

# Subtract 30 days
date_days_ago = current_date - timedelta(days=DEFAULT_DAYS_AGO)

# Format the date in the required format (YYYY/MM/DD)
formatted_date = date_days_ago.strftime("%Y/%m/%d")

APPLIED_FILTER_PATH = (
    Path(__file__).parent / "email_query_filters" / "applied_email_filter.yaml"
)
APPLIED_FILTER_OVERRIDES_PATH = (
    Path(__file__).parent
    / "email_query_filters"
    / "applied_email_filter_overrides.yaml"
)
QUERY_APPLIED_EMAIL_FILTER = (
    f"after:{formatted_date} AND ({parse_base_filter_config(APPLIED_FILTER_PATH)})"
)

# ------ implement override filter later!! #
# OR \n"
# f"{parse_override_filter_config(APPLIED_FILTER_OVERRIDES_PATH)})"
# )
# label:jobs -label:query4

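Taken together, these constants assemble one Gmail search string at import time: a rolling after: date (30 days back) ANDed with the query parsed from the base filter YAML. A sketch of the resulting shape; the date and filter body below are hypothetical, since the YAML file is not included in this dump:

# Sketch only: approximate shape of QUERY_APPLIED_EMAIL_FILTER.
example_query = (
    'after:2025/04/29 AND ((subject:"application has been submitted"'
    ' OR from:"[email protected]"))'
)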
---
target_repo/backend/database.py
---
import os
from typing import Annotated
from sqlmodel import SQLModel, create_engine, Session
from utils.config_utils import get_settings
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
import fastapi


def create_db_and_tables():
    SQLModel.metadata.create_all(engine)

def get_session():
    return Session(engine)


def request_session():
    session = get_session()

    with session.begin():
        yield session


DBSession = Annotated[Session, fastapi.Depends(request_session)]

settings = get_settings()
IS_DOCKER_CONTAINER = os.environ.get("IS_DOCKER_CONTAINER", 0)
if IS_DOCKER_CONTAINER:
    DATABASE_URL = settings.DATABASE_URL_DOCKER
elif settings.is_publicly_deployed:
    DATABASE_URL = settings.DATABASE_URL
else:
    DATABASE_URL = settings.DATABASE_URL_LOCAL_VIRTUAL_ENV

engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
Base = declarative_base()

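DBSession packages request_session as a FastAPI dependency: each request gets a Session wrapped in session.begin(), so the transaction commits on normal exit and rolls back on error. A minimal sketch of a consuming route (the endpoint itself is hypothetical, not in the audited repo):

# Sketch only: a hypothetical route consuming the DBSession dependency.
from fastapi import APIRouter
from database import DBSession

router = APIRouter()

@router.get("/example")  # hypothetical endpoint
def read_example(session: DBSession):
    # the yielded session is bound to an open transaction for this request
    return {"in_transaction": session.in_transaction()}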
---
target_repo/backend/main.py
---
import logging

from fastapi import FastAPI, HTTPException, Request, Depends
from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from starlette.middleware.sessions import SessionMiddleware
from fastapi.middleware.cors import CORSMiddleware
from slowapi import Limiter
from slowapi.util import get_remote_address
from slowapi.errors import RateLimitExceeded
from slowapi.middleware import SlowAPIMiddleware
from db.users import UserData
from db.utils.user_utils import add_user
from utils.config_utils import get_settings
from session.session_layer import validate_session
from contextlib import asynccontextmanager
from database import create_db_and_tables

# Import routes
from routes import email_routes, auth_routes, file_routes, users_routes, start_date_routes

@asynccontextmanager
async def lifespan(app: FastAPI):
    create_db_and_tables()
    yield

app = FastAPI(lifespan=lifespan)
settings = get_settings()
APP_URL = settings.APP_URL
app.add_middleware(SessionMiddleware, secret_key=settings.COOKIE_SECRET)
app.mount("/static", StaticFiles(directory="static"), name="static")

# Register routes
app.include_router(auth_routes.router)
app.include_router(email_routes.router)
app.include_router(file_routes.router)
app.include_router(users_routes.router)
app.include_router(start_date_routes.router)

limiter = Limiter(key_func=get_remote_address)
app.state.limiter = limiter  # Ensure limiter is assigned

# Configure CORS
if settings.is_publicly_deployed:
    # Production CORS settings
    origins = ["https://www.jobba.help", "https://www.staging.jobba.help",
               "https://www.app.justajobapp.com", "https://www.api.justajobapp.com"]
else:
    # Development CORS settings
    origins = [
        "http://localhost:3000",  # Assuming frontend runs on port 3000
        "http://127.0.0.1:3000",
    ]

# Add SlowAPI middleware for rate limiting
app.add_middleware(SlowAPIMiddleware)

# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,  # Allow frontend origins
    allow_credentials=True,
    allow_methods=["*"],  # Allow all HTTP methods (GET, POST, etc.)
    allow_headers=["*"],  # Allow all headers
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,  # Allow frontend origins
    allow_credentials=True,
    allow_methods=["*"],  # Allow all HTTP methods (GET, POST, etc.)
    allow_headers=["*"],  # Allow all headers
)

# Set up Jinja2 templates
templates = Jinja2Templates(directory="templates")

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.DEBUG, format="%(levelname)s - %(message)s")


# Rate limit exception handler
@app.exception_handler(RateLimitExceeded)
async def rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded):
    raise HTTPException(
        status_code=429,
        detail="Too many requests. Please try again later.",
    )


@app.post("/api/add-user")
@limiter.limit("3/minute")
async def add_user_endpoint(user_data: UserData, request: Request, user_id: str = Depends(validate_session)):
    """
    This endpoint adds a user to the database and session storage
    """
    try:
        add_user(user_data, request)
        return {"message": "User added successfully"}
    except Exception as e:
        # Log the error for debugging purposes
        logger.error(f"An error occurred while adding user: {e}")
        return {"error": "An error occurred while adding the user."}


@app.get("/")
async def root(request: Request, response_class=HTMLResponse):
    return templates.TemplateResponse("homepage.html", {"request": request})

# Run the app using Uvicorn
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)

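Note that CORSMiddleware is registered twice with identical settings, so every request traverses two CORS layers. Starlette middleware is a stack built by add_middleware and can be inspected directly; a sketch of what that inspection would show for this app:

# Sketch only: listing the registered middleware stack (outermost first).
for mw in app.user_middleware:
    print(mw.cls.__name__)
# Expected: CORSMiddleware, CORSMiddleware, SlowAPIMiddleware, SessionMiddleware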
---
target_repo/backend/start_date/storage.py
---
"""
This file contains the main constants used in the application.
"""
from pathlib import Path
from utils.filter_utils import (
    parse_base_filter_config,
)
from constants import QUERY_APPLIED_EMAIL_FILTER

APPLIED_FILTER_PATH = (
    Path(__file__).parent.parent / "email_query_filters" / "applied_email_filter.yaml"
)

def get_start_date_email_filter(start_date: str) -> str:
    if not start_date:
        return QUERY_APPLIED_EMAIL_FILTER

    START_DATE_EMAIL_FILTER = (
        f"after:{start_date} AND ({parse_base_filter_config(APPLIED_FILTER_PATH)})"
    )
    return START_DATE_EMAIL_FILTER

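get_start_date_email_filter falls back to the default 30-day query when start_date is falsy; any other value is interpolated into the Gmail query verbatim, with no format validation. A sketch of both branches (the date is hypothetical):

# Sketch only: the two branches of get_start_date_email_filter.
assert get_start_date_email_filter("") == QUERY_APPLIED_EMAIL_FILTER
custom = get_start_date_email_filter("2025/01/01")
assert custom.startswith("after:2025/01/01 AND (")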
---
target_repo/backend/tests/__init__.py
---


---
target_repo/backend/tests/conftest.py
---
import sys
import os

import pytest
from testcontainers.postgres import PostgresContainer
import sqlalchemy as sa
from sqlalchemy.orm import Session
from sqlmodel import SQLModel

# Add the parent directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
os.chdir("./backend")

import database  # noqa: E402


@pytest.fixture(scope="session")
def postgres_container():
    with PostgresContainer("postgres:13") as postgres:
        yield postgres


@pytest.fixture
def engine(postgres_container: PostgresContainer, monkeypatch):
    test_engine = sa.create_engine(
        sa.URL.create(
            "postgresql",
            username=postgres_container.username,
            password=postgres_container.password,
            host=postgres_container.get_container_host_ip(),
            port=postgres_container.get_exposed_port(postgres_container.port),
            database=postgres_container.dbname,
        )
    )

    monkeypatch.setattr(database, "engine", test_engine)

    database.create_db_and_tables()

    yield test_engine

    with test_engine.begin() as transaction:
        transaction.execute(
            sa.text("SET session_replication_role = :role"), {"role": "replica"}
        )
        for table in SQLModel.metadata.tables.values():
            transaction.execute(table.delete())


@pytest.fixture
def db_session(engine, monkeypatch):
    with Session(database.engine) as session:
        yield session

---
target_repo/backend/tests/test_config_utils.py
---
from unittest.mock import patch
from utils.config_utils import get_settings
from config import Settings
import pytest
import json
import os


@pytest.fixture(scope="session", autouse=True)
def setup_static_directory():
    static_dir = os.path.join(os.path.dirname(__file__), "../static")
    if not os.path.exists(static_dir):
        os.makedirs(static_dir)


@patch("utils.config_utils.config.Settings")
def test_get_settings_only_called_once_with_lru(mock_settings_call):
    get_settings.cache_clear()
    get_settings()
    get_settings()
    # Ensure the Settings constructor is called only once due to lru_cache
    mock_settings_call.assert_called_once()
    get_settings.cache_clear()


def test_import_settings_does_not_raise_error():
    import backend.utils.llm_utils  # noqa: F401
    import backend.utils.auth_utils  # noqa: F401


def test_decode_scopes_valid_json():
    input_str = '["https://www.googleapis.com/auth/gmail.readonly", "openid"]'
    expected_output = ["https://www.googleapis.com/auth/gmail.readonly", "openid"]
    assert Settings.decode_scopes(input_str) == expected_output


def test_decode_scopes_with_extra_quotes():
    input_str = '\'["https://www.googleapis.com/auth/gmail.readonly", "openid"]\''
    expected_output = ["https://www.googleapis.com/auth/gmail.readonly", "openid"]
    assert Settings.decode_scopes(input_str) == expected_output


def test_decode_scopes_invalid_json():
    input_str = '["https://www.googleapis.com/auth/gmail.readonly", "openid"'
    with pytest.raises(json.JSONDecodeError):
        Settings.decode_scopes(input_str)


def test_decode_scopes_empty_string():
    input_str = ""
    with pytest.raises(json.JSONDecodeError):
        Settings.decode_scopes(input_str)


def test_prod_is_publicly_deployed_true():
    settings = Settings(ENV="prod")
    assert settings.is_publicly_deployed


def test_dev_is_publicly_deployed_false():
    settings = Settings(ENV="dev")
    assert not settings.is_publicly_deployed


def test_staging_is_publicly_deployed_true():
    settings = Settings(ENV="staging")
    assert settings.is_publicly_deployed

---
target_repo/backend/tests/test_constants.py
---
from pathlib import Path

SUBJECT_LINE = "Invitation from an unknown sender: Interview with TestCompanyName @ Thu May 2, 2024 11:00am - 12pm (PDT) ([email protected])"
SAMPLE_MESSAGE = {
    "id": "abc123",
    "threadId": "abc123",
    "labelIds": ["IMPORTANT", "CATEGORY_PERSONAL", "Label_1"],
    "snippet": "Interview with TestCompanyName Unknown sender This event from [email protected] won't appear in your calendar unless you say you know the sender. Know this sender? When Thursday May 9, 2024 ⋅ 02:40pm –",
    "payload": {
        "partId": "",
        "mimeType": "multipart/mixed",
        "filename": "",
        "headers": [
            {"name": "Delivered-To", "value": "[email protected]"},
            {
                "name": "Received",
                "value": "by 2024:abc:6000:2000:b0:200:1000:5000 with SMTP id cub; Thu, 2 May 2024 16:45:00 -0700 (PDT)",
            },
            {
                "name": "X-Received",
                "value": "by 2024:abc:6000:2000:b0:200:1000:5000 with SMTP id def567-890jkl.9.000000000000; Thu, 2 May 2024 16:45:00 -0700 (PDT)",
            },
            {
                "name": "ARC-Seal",
                "value": "redacted-ARC-value",
            },
            {
                "name": "ARC-Message-Signature",
                "value": "i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=arc-00000000; h=to:from:subject:date:message-id:sender:reply-to:mime-version :dkim-signature:dkim-signature; bh=pqr123; fh=AZ123/PST=; b=GAH",
            },
            {
                "name": "ARC-Authentication-Results",
                "value": "i=1; mx.google.com; dkim=pass [email protected] header.s=10101101 header.b=WOOHOO; dkim=pass [email protected] header.s=google header.b=di8r; spf=pass (google.com: domain of [email protected] designates 000.00.000.00 as permitted sender) [email protected]; dmarc=pass (p=NONE sp=NONE dis=NONE) header.from=testcompanyname.com",
            },
            {"name": "Return-Path", "value": "<[email protected]>"},
            {
                "name": "Received",
                "value": "from mail-fff-a00.google.com (mail-fff-a00.google.com. [000.00.000.00]) by mx.google.com with SMTPS id def567-890mno.0.2024.05.02.16.45.00 for <[email protected]> (Google Transport Security); Thu, 2 May 2024 16:45:00 -0700 (PDT)",
            },
            {
                "name": "Received-SPF",
                "value": "pass (google.com: domain of [email protected] designates 000.00.000.00 as permitted sender) client-ip=000.00.000.00;",
            },
            {
                "name": "Authentication-Results",
                "value": "mx.google.com; dkim=pass [email protected] header.s=10101101 header.b=WOOHOO; dkim=pass [email protected] header.s=google header.b=di8r; spf=pass (google.com: domain of [email protected] designates 000.00.000.00 as permitted sender) [email protected]; dmarc=pass (p=NONE sp=NONE dis=NONE) header.from=testcompanyname.com",
            },
            {
                "name": "DKIM-Signature",
                "value": "v=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=10101101; t=1111111111; x=1111111111; dara=google.com; h=to:from:subject:date:message-id:sender:reply-to:mime-version:from :to:cc:subject:date:message-id:reply-to; bh=pqr123; b=GAH",
            },
            {
                "name": "DKIM-Signature",
                "value": "v=1; a=rsa-sha256; c=relaxed/relaxed; d=testcompanyname.com; s=google; t=1111111111; x=1111111111; dara=google.com; h=to:from:subject:date:message-id:sender:reply-to:mime-version:from :to:cc:subject:date:message-id:reply-to; bh=pqr123; b=GAH",
            },
            {
                "name": "X-Google-DKIM-Signature",
                "value": "v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=10101101; t=1111111111; x=1111111111; h=to:from:subject:date:message-id:sender:reply-to:mime-version :x-gm-message-state:from:to:cc:subject:date:message-id:reply-to; bh=pqr123; b=BLAH",
            },
            {
                "name": "X-Gm-Message-State",
                "value": "AGH",
            },
            {
                "name": "X-Google-Smtp-Source",
                "value": "AGH",
            },
            {"name": "MIME-Version", "value": "1.0"},
            {
                "name": "X-Received",
                "value": "by 2222:abc:600:2000:d0:777:9000:4000 with SMTP id def567-890ghi.10.1111111111566; Thu, 2 May 2024 16:45:00 -0700 (PDT)",
            },
            {
                "name": "Reply-To",
                "value": "Recruiter Name <[email protected]>",
            },
            {
                "name": "Sender",
                "value": "Google Calendar <[email protected]>",
            },
            {
                "name": "Message-ID",
                "value": "<[email protected]>",
            },
            {"name": "Date", "value": "Thu, 2 May 2024 16:45:00 +0000"},
            {
                "name": "Subject",
                "value": "Invitation from an unknown sender: Interview with TestCompanyName @ Thu May 2, 2024 11:00am - 12pm (PDT) ([email protected])",
            },
            {
                "name": "From",
                "value": "Recruiter Name <[email protected]>",
            },
            {"name": "To", "value": "[email protected]"},
            {
                "name": "Content-Type",
                "value": 'multipart/mixed; boundary="000000000000"',
            },
        ],
620 "body": {"size": 0},
621 "parts": [
622 {
623 "partId": "0",
624 "mimeType": "multipart/alternative",
625 "filename": "",
626 "headers": [
627 {
628 "name": "Content-Type",
629 "value": 'multipart/alternative; boundary="000000000000"',
630 }
631 ],
632 "body": {"size": 0},
633 "parts": [
634 {
635 "partId": "0.0",
636 "mimeType": "text/plain",
637 "filename": "",
638 "headers": [
639 {
640 "name": "Content-Type",
641 "value": 'text/plain; charset="UTF-8"; format=flowed; delsp=yes',
642 },
643 {"name": "Content-Transfer-Encoding", "value": "base64"},
644 ],
645 "body": {
646 "size": 2000,
647 "data": "abc",
648 },
649 },
650 {
651 "partId": "0.1",
652 "mimeType": "text/html",
653 "filename": "",
654 "headers": [
655 {
656 "name": "Content-Type",
657 "value": 'text/html; charset="UTF-8"',
658 },
659 {
660 "name": "Content-Transfer-Encoding",
661 "value": "quoted-printable",
662 },
663 ],
664 "body": {
665 "size": 30000,
666 "data": "abc",
667 },
668 },
669 {
670 "partId": "0.2",
671 "mimeType": "text/calendar",
672 "filename": "invite.ics",
673 "headers": [
674 {
675 "name": "Content-Type",
676 "value": 'text/calendar; charset="UTF-8"; method=REQUEST',
677 },
678 {"name": "Content-Transfer-Encoding", "value": "7bit"},
679 ],
680 "body": {
681 "attachmentId": "",
682 "size": 1000,
683 },
684 },
685 ],
686 },
687 {
688 "partId": "1",
689 "mimeType": "application/ics",
690 "filename": "invite.ics",
691 "headers": [
692 {
693 "name": "Content-Type",
694 "value": 'application/ics; name="invite.ics"',
695 },
696 {
697 "name": "Content-Disposition",
698 "value": 'attachment; filename="invite.ics"',
699 },
700 {"name": "Content-Transfer-Encoding", "value": "base64"},
701 ],
702 "body": {
703 "attachmentId": "",
704 "size": 1000,
705 },
706 },
707 ],
708 },
709 "sizeEstimate": 33333,
710 "historyId": "22222222",
711 "internalDate": "1111111111000",
712}

DESIRED_PASS_APPLIED_EMAIL_FILTER_SUBJECT = [
    "Thank you for your Application!",
    "Jobba, your application was sent to The Huts",
    "Your Interview with",
    "Thank you for your job application"
]

DESIRED_FAIL_APPLIED_EMAIL_FILTER_FROM = [
    "[email protected]",  # made up, would be better to capture the real example
    # ... (several entries elided in this dump)
]

DESIRED_FAIL_APPLIED_EMAIL_FILTER_SUBJECT = [
    "Apply to",
    "Apply now",
    "New job",
    "Job Search Council Matching - Next Steps"
]

DESIRED_PASS_APPLIED_EMAIL_FILTER_FROM = ["[email protected]", "myworkday.com"]

SAMPLE_FILTER_PATH = Path(__file__).parent / "sample_base_filter.yaml"
EXPECTED_SAMPLE_QUERY_STRING = """(subject:"application has been submitted"
    OR (subject:"application to" AND subject:"successfully submitted")
    OR from:"[email protected]"
    AND -from:"[email protected]"
    AND -subject:"watering")"""

---
target_repo/backend/tests/test_email_utils.py
---
from unittest import mock
import pytest

from tests.test_constants import SAMPLE_MESSAGE, SUBJECT_LINE
import utils.email_utils as email_utils
import db.utils.user_email_utils as user_email_utils

def test_get_top_consecutive_capitalized_words():
    test_cases = {
        (
            ("Hello", 10),  # capitalized, highest frequency, prioritize
            ("World", 8),  # capitalized, lower frequency, ignore
        ): "Hello",
        (
            ("Hello", 10),  # capitalized, highest frequency, prioritize
            ("World", 10),  # capitalized, highest frequency, add to result
            ("How", 5),  # capitalized, lower frequency, ignore
        ): "Hello World",
        (
            ("hello", 5),  # not capitalized, highest frequency, ignore
            ("World", 5),  # capitalized, highest frequency, prioritize
            ("How", 5),  # capitalized, highest frequency, add to result
            ("are", 5),  # not capitalized, highest frequency, ignore
        ): "World How",
        (
            ("hello", 5),  # not capitalized, highest frequency, ignore
            ("world", 5),  # capitalized, highest frequency, prioritize
            ("how", 5),  # capitalized, highest frequency, add to result
            ("are", 5),  # not capitalized, highest frequency, ignore
        ): "",  # no consecutive capitalized words
    }
    for word_list, expected_value in test_cases.items():
        result = email_utils.get_top_consecutive_capitalized_words(word_list)
        assert result == expected_value


def test_is_valid_email():
    email_test_cases = {
        "[email protected]": True,
        "[email protected]": False,  # Invalid domain
        "no-reply.com": False,  # Missing @
    }
    for email, expected_value in email_test_cases.items():
        is_valid = email_utils.is_valid_email(email)
        assert is_valid == expected_value, "email: %s" % email


def test_is_email_automated():
    email_test_cases = {
        "[email protected]": True,
        "[email protected]": True,
        "[email protected]": True,
        "[email protected]": True,
        "[email protected]": True,
        "[email protected]": False,
    }
    for email, expected_value in email_test_cases.items():
        is_automated = email_utils.is_automated_email(email)
        assert is_automated == expected_value, "email: %s" % email


def test_get_email_subject_line():
    subject_line = email_utils.get_email_subject_line(SAMPLE_MESSAGE)
    assert (
        subject_line
        == "Invitation from an unknown sender: Interview with TestCompanyName @ Thu May 2, 2024 11:00am - 12pm (PDT) ([email protected])"
    )


def test_get_email_from_address():
    from_address = email_utils.get_email_from_address(SAMPLE_MESSAGE)
    assert from_address == "[email protected]"


def test_get_email_domain():
    from_email_domain = email_utils.get_email_domain_from_address(
        email_utils.get_email_from_address(SAMPLE_MESSAGE)  # argument elided in this dump; reconstructed from the test above
    )
    assert from_email_domain == "testcompanyname.com"


def test_is_generic_email_domain():
    assert email_utils.is_generic_email_domain("hire.lever.co")
    assert email_utils.is_generic_email_domain("us.greenhouse-mail.io")


def test_get_last_capitalized_words_in_line():
    last_capitalized_words = email_utils.get_last_capitalized_words_in_line(
        "Thank you for your application to CompanyName"
    )
    assert last_capitalized_words == "CompanyName"


def test_get_company_name_returns_email_domain():
    company_name = email_utils.get_company_name(
        id="abc123", msg=SAMPLE_MESSAGE, subject_line=SUBJECT_LINE
    )
    assert company_name == "testcompanyname"


def test_get_company_name_returns_top_word():
    """Default behavior for company name is to return the
    highest frequency word that appears in the email body."""
    with mock.patch(
        "utils.email_utils.get_top_word_in_email_body", return_value="FakeCompany"
    ):
        company_name = email_utils.get_company_name(
            id="abc123", msg=SAMPLE_MESSAGE, subject_line=SUBJECT_LINE
        )
        assert company_name == "FakeCompany"


def test_get_company_name_returns_last_capital_word_in_subject_line():
    """Default behavior for company name is to return the
    highest frequency word that appears in the email body."""
    with (
        mock.patch(
            "utils.email_utils.get_top_word_in_email_body", return_value="interview"
        ),
        mock.patch(
            "utils.email_utils.get_email_from_address",
            return_value="[email protected]",
        ),
    ):
        company_name = email_utils.get_company_name(
            id="abc123",
            msg=SAMPLE_MESSAGE,
            subject_line="Thanks for interviewing with CoolCompany",
        )
        assert company_name == "CoolCompany"


def test_get_email_received_at_timestamp():
    received_at = email_utils.get_received_at_timestamp(1, SAMPLE_MESSAGE)
    assert received_at == "Thu, 2 May 2024 16:45:00 +0000"


@pytest.fixture
def mock_user():
    user = mock.MagicMock()
    user.user_id = "test_user_123"
    return user


@pytest.fixture
def message_data_with_list_values():
    """Message data where received_at is a list instead of a string"""
    return {
        "id": "19501385930c533f",
        "company_name": "",
        "application_status": "",
        "received_at": "Thu, 13 Feb 2025 21:30:24 +0000 (UTC)",
        "subject": "Message replied: Are you looking for Remote opportunities?",
        "job_title": "",
        "from": "Tester Recruiter <[email protected]>"
    }


@mock.patch('db.utils.user_email_utils.check_email_exists')
def test_create_user_email_with_list_values(mock_check_email, mock_user, message_data_with_list_values, caplog):
    """Test that create_user_email handles message_data_with_list_values correctly"""
    mock_check_email.return_value = False
    result = user_email_utils.create_user_email(mock_user, message_data_with_list_values)
    assert result is not None  # user email created successfully

---
target_repo/backend/tests/test_filter_schema.py
---
"""
these tests are intended to verify that the changes made to filter yamls will yield the
desired results. Note that these tests DO NOT make any checks against functions in
filter_utils. If you make changes there, the correct tests are found in test_filter_utils.

tests for override filters have not yet been implemented
"""

import pytest
from pathlib import Path
import yaml
from typing import List, Dict, Union
import re
from constants import APPLIED_FILTER_PATH  # , APPLIED_FILTER_OVERRIDES_PATH
from tests.test_constants import (
    DESIRED_FAIL_APPLIED_EMAIL_FILTER_SUBJECT,
    DESIRED_PASS_APPLIED_EMAIL_FILTER_FROM,
    SAMPLE_FILTER_PATH,
)

FilterConfigType = List[Dict[str, Union[str, int, bool, list, dict]]]

FILTER_CONFIG_DIR = Path(__file__).parent.parent / "email_query_filters"


def get_base_filter_config_paths() -> List[Path]:
    return [SAMPLE_FILTER_PATH] + [
        x for x in FILTER_CONFIG_DIR.iterdir() if "override" not in str(x)
    ]


def get_override_filter_config_paths() -> List[Path]:
    return [x for x in FILTER_CONFIG_DIR.iterdir() if "override" in str(x)]


def load_filter_config(filter_path: str) -> FilterConfigType:
    with open(filter_path, "r") as fid:
        filter_config = yaml.safe_load(fid)
    return filter_config


def validate_schema_block_order(filter_config: FilterConfigType) -> bool:
    """
    Validates that 'exclude' blocks appear after 'include' blocks in the schema.
    """

    include_seen = False
    for block in filter_config:
        how = block.get("how")
        if how == "include":
            include_seen = True
        elif how == "exclude" and not include_seen:
            return False  # Exclude block before any include block

    return True


@pytest.mark.parametrize(
    "filter_config", [load_filter_config(x) for x in get_base_filter_config_paths()]
)
def test_base_filter_yaml_schema(filter_config):
    logic_list = [block["logic"] for block in filter_config if block["logic"]]
    how_list = [block["how"] for block in filter_config]
    exclude_terms = sum(
        [block["terms"] for block in filter_config if block["how"] == "exclude"], []
    )

    assert all(
        [
            (x == "any" and y == "include") or (x == "all" and y == "exclude")
            for x, y in zip(logic_list, how_list)
        ]
    ), "logic=any is not allowed for how=exclude"
    assert all(["*" not in x for x in exclude_terms]), (
        "wildcard is not allowed in exclude blocks"
    )
    assert validate_schema_block_order(filter_config), (
        "Exclude block found before an include block"
    )


def apply_base_filter(field_text, field_name, filter_config) -> bool:
    """Applies the YAML filter to the given text."""

    ret_val = False  # Default to failing if no filter logic is defined.

    for block in filter_config:
        if block["field"] == field_name:
            # check if the text is in the any, include block for that field
            if block["logic"] == "any" and block["how"] == "include":
                # simple compare
                if not ret_val:
                    ret_val = any(
                        [
                            x.lower() in field_text.lower()
                            for x in block["terms"]
                            if "*" not in x
                        ]
                    )

                # use regex for wildcard compare
                if not ret_val:
                    ret_val = any(
                        [
                            re.findall(
                                x.replace(" * ", ".*").lower(), field_text.lower()
                            )
                            for x in block["terms"]
                            if "*" in x
                        ]
                    )

            # check if the text is in the all, exclude block for that field.
            # all, exclude logic will override any matching includes
            if ret_val:
                if block["logic"] == "all" and block["how"] == "exclude":
                    ret_val = all(
                        [x.lower() not in field_text.lower() for x in block["terms"]]
                    )

    return ret_val


@pytest.mark.parametrize(
    "test_constant,filter_config",
    [(DESIRED_FAIL_APPLIED_EMAIL_FILTER_SUBJECT, APPLIED_FILTER_PATH)],
)
def test_apply_email_filter_subject_fail(test_constant, filter_config):
    """
    Tests if the desired subject pairs in test_constants will fail the filter
    """
    filter_config = load_filter_config(APPLIED_FILTER_PATH)

    result_list = []
    for subject_text in test_constant:
        result = apply_base_filter(subject_text, "subject", filter_config)
        result_list.append(result)

    assert not any(result_list), (
        f"These subject pairs failed to fail: {[x for x, y in list(zip(test_constant, result_list)) if y]}"
    )


@pytest.mark.parametrize(
    "test_constant,filter_config",
    [(DESIRED_PASS_APPLIED_EMAIL_FILTER_FROM, APPLIED_FILTER_PATH)],
)
def test_apply_email_filter_from_pass(test_constant, filter_config):
    """
    Tests if the desired from pairs in test_constants will pass the filter
    """
    filter_config = load_filter_config(APPLIED_FILTER_PATH)

    result_list = []
    for from_text in test_constant:
        result = apply_base_filter(from_text, "from", filter_config)
        result_list.append(result)

    assert all(result_list), (
        f"These from pairs failed to pass: {[x for x, y in list(zip(test_constant, result_list)) if not y]}"
    )


@pytest.mark.parametrize(
    "test_constant,filter_config",
    [(DESIRED_FAIL_APPLIED_EMAIL_FILTER_SUBJECT, APPLIED_FILTER_PATH)],
)
def test_apply_email_filter_from_fail(test_constant, filter_config):
    """
    Tests if the desired from pairs in test_constants will fail the filter
    """
    filter_config = load_filter_config(APPLIED_FILTER_PATH)

    result_list = []
    for from_text in test_constant:
        result = apply_base_filter(from_text, "from", filter_config)
        result_list.append(result)

    assert not any(result_list), (
        f"These from pairs failed to fail: {[x for x, y in list(zip(test_constant, result_list)) if y]}"
    )

---
target_repo/backend/tests/test_filter_utils.py
---
"""
test that the strings produced by filter utils match expectations

tests for override filters have not yet been implemented.
"""

from typing import List, Dict, Union

from utils.filter_utils import (
    parse_base_filter_config,
)  # , parse_override_filter_config
from tests.test_constants import SAMPLE_FILTER_PATH, EXPECTED_SAMPLE_QUERY_STRING

FilterConfigType = List[Dict[str, Union[str, int, bool, list, dict]]]


def test_parse_filter_config_against_sample_filter(
    filter_path=SAMPLE_FILTER_PATH, expected_query_string=EXPECTED_SAMPLE_QUERY_STRING
):
    result_str = parse_base_filter_config(filter_path)

    # remove white space from expected string for the purpose of comparing
    expected_query_string = (
        expected_query_string.replace("\n", "").replace("\t", "").replace(" ", "")
    )

    assert result_str == expected_query_string, (
        "result query string doesn't match expected query string"
    )


---
target_repo/backend/tests/routes/__init__.py
---


---
target_repo/backend/tests/routes/conftest.py
---
from datetime import datetime, timedelta
from unittest import mock

import pytest
from fastapi.testclient import TestClient

from db.users import Users
import database
import main


@pytest.fixture
def client(db_session):
    main.app.dependency_overrides[database.request_session] = lambda: db_session
    test_client = TestClient(main.app)

    return test_client


@pytest.fixture
def logged_in_user(db_session, client):
    # create user
    user = Users(
        user_id="123",
        user_email="[email protected]",
        start_date=datetime(2000, 1, 1),
    )
    db_session.add(user)
    db_session.flush()

    # log in
    mock_credentials = mock.Mock(
        **{
            "expiry": datetime.utcnow() + timedelta(seconds=10),
            "token": "fake access token",
            "to_json.return_value": {"foo": "bar"},
        }
    )
    mock_decoded_token = {"sub": user.user_id, "email": user.user_email}
    with (
        mock.patch(
            "routes.auth_routes.Flow",
            **{"from_client_secrets_file.return_value.credentials": mock_credentials},
        ),
        mock.patch(
            "utils.auth_utils.id_token",
            **{"verify_oauth2_token.return_value": mock_decoded_token},
        ),
    ):
        auth_resp = client.get("/login", params={"code": "abc"}, follow_redirects=False)
        assert auth_resp.status_code == 303
        assert auth_resp.headers["Location"] == "http://localhost:3000/dashboard"

    return user

---
target_repo/backend/tests/routes/test_email_routes.py
---
from utils import auth_utils
from unittest import mock
from datetime import datetime

from fastapi import Request
from sqlalchemy.orm import Session
from google.oauth2.credentials import Credentials

from db.users import Users
from db.processing_tasks import TaskRuns, FINISHED, STARTED
from routes.email_routes import fetch_emails_to_db


def test_processing(db_session, client, logged_in_user):
    db_session.add(TaskRuns(user=logged_in_user, status=STARTED))
    db_session.flush()

    # make request to check on processing status
    resp = client.get("/processing", follow_redirects=False)

    # assert response
    assert resp.status_code == 200, resp.headers
    assert resp.json()["processed_emails"] == 0


def test_processing_404(db_session, client, logged_in_user):
    resp = client.get("/processing", follow_redirects=False)
    assert resp.status_code == 404


def test_fetch_emails_to_db(db_session: Session):
    test_user_id = "123"

    db_session.add(
        Users(
            user_id=test_user_id,
            user_email="[email protected]",
            start_date=datetime(2000, 1, 1),
        )
    )
    db_session.commit()

    with mock.patch("routes.email_routes.get_email_ids"):
        fetch_emails_to_db(
            auth_utils.AuthenticatedUser(Credentials("abc")),
            Request({"type": "http", "session": {}}),
            user_id=test_user_id,
        )

    task_run = db_session.get(TaskRuns, test_user_id)
    assert task_run.status == FINISHED


def test_fetch_emails_to_db_in_progress_rate_limited_no_processing(db_session: Session):
    test_user_id = "123"

    user = Users(
        user_id=test_user_id,
        user_email="[email protected]",
        start_date=datetime(2000, 1, 1),
    )
    db_session.add(user)
    db_session.add(TaskRuns(user=user, status=STARTED))
    db_session.commit()

    with mock.patch("routes.email_routes.get_email_ids") as mock_get_email_ids:
        fetch_emails_to_db(
            auth_utils.AuthenticatedUser(Credentials("abc")),
            Request({"type": "http", "session": {}}),
            user_id=test_user_id,
        )

    mock_get_email_ids.assert_not_called()
    task_run = db_session.get(TaskRuns, test_user_id)
    assert task_run.status == STARTED

---
target_repo/backend/utils/auth_utils.py
---
import logging
import uuid

from utils.file_utils import get_user_filepath

from google.oauth2.credentials import Credentials
from google.auth.transport.requests import Request
from google.oauth2 import id_token

from utils.config_utils import get_settings

logger = logging.getLogger(__name__)

settings = get_settings()


class AuthenticatedUser:
    """
    The AuthenticatedUser class is used to
    store information about the user. This
    class is instantiated after the user has
    successfully authenticated with Google.
    """

    def __init__(self, creds: Credentials, start_date=None):
        self.creds = creds
        self.user_id, self.user_email = self.get_user_id_and_email()
        self.filepath = get_user_filepath(self.user_id)
        self.start_date = start_date

    def get_user_id_and_email(self) -> tuple:
        """
        Retrieves the user ID and email from Google OAuth2 credentials.

        Parameters:

        Returns:
        - user_id: The unique user ID.
        - email: The user's email address.
        """
        try:
            logger.info("Verifying ID token...")

            # Ensure we have an ID token
            if not self.creds.id_token:
                logger.warning("ID token is missing, trying to refresh credentials...")
                self.creds.refresh(Request())  # Refresh credentials

            # If still missing, raise an error
            if not self.creds.id_token:
                raise ValueError("No ID token available after refresh.")

            decoded_token = id_token.verify_oauth2_token(
                self.creds.id_token, Request(), audience=settings.GOOGLE_CLIENT_ID
            )
            user_id = decoded_token["sub"]  # 'sub' is the unique user ID
            user_email = decoded_token.get("email")  # 'email' is the user's email address
            return user_id, user_email

        except (KeyError, TypeError):
            self.creds = self.creds.refresh(Request())
            if not self.creds.id_token:
                proxy_user_id = str(uuid.uuid4())
                logger.error(
                    "Could not retrieve user ID. Using proxy ID: %s", proxy_user_id
                )
                return proxy_user_id, None  # Generate a random ID and return None for email
            if not hasattr(self, "_retry"):
                self._retry = True
                return self.get_user_id_and_email()
            else:
                proxy_user_id = str(uuid.uuid4())
                logger.error(
                    "Could not retrieve user ID after retry. Using proxy ID: %s",
                    proxy_user_id,
                )
                return proxy_user_id, None  # Generate a random ID and return None for email
        except Exception as e:
            logger.error("Error verifying ID token: %s", e)
            proxy_user_id = str(uuid.uuid4())
            logger.error("Could not verify ID token. Using proxy ID: %s", proxy_user_id)
            return proxy_user_id, None  # Generate a random ID and return None for email

---
target_repo/backend/utils/config_utils.py
---
from functools import lru_cache
import config


@lru_cache
def get_settings():
    return config.Settings()

---
target_repo/backend/utils/cookie_utils.py
---
from fastapi import Response
from utils.config_utils import get_settings

settings = get_settings()


def set_conditional_cookie(
    response: Response,
    key: str,
    value: str,
    max_age: int = 3600,  # 1 hour
    path: str = "/",
    httponly: bool = True,
):
    """Helper function to set cookies with environment-appropriate settings"""
    cookie_params = {
        "key": key,
        "value": value,
        "max_age": max_age,
        "path": path,
        "httponly": httponly,
    }

    # Add environment-specific parameters
    if settings.is_publicly_deployed:
        cookie_params.update(
            {"domain": settings.ORIGIN, "secure": True, "samesite": "Strict"}
        )
    else:
        cookie_params.update({"secure": False, "samesite": "Lax"})

    # Apply cookie prefixes for additional security
    if cookie_params["secure"]:
        if cookie_params["path"] == "/" and "domain" not in cookie_params:
            cookie_params["key"] = f"__Host-{cookie_params['key']}"
        else:
            cookie_params["key"] = f"__Secure-{cookie_params['key']}"

    response.set_cookie(**cookie_params)
    return response

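Tracing the prefix logic shows the __Host- branch is effectively dead: when is_publicly_deployed is true, secure=True but a domain key is always present, so every cookie gets the weaker __Secure- prefix; locally, secure=False and no prefix is applied at all. A sketch (cookie name hypothetical):

# Sketch only: tracing set_conditional_cookie's branches.
from fastapi import Response

resp = set_conditional_cookie(Response(), key="session_id", value="abc")
# publicly deployed -> name "__Secure-session_id" ("__Host-" is unreachable,
# because the domain parameter is always set alongside secure=True)
# local dev        -> name stays "session_id" (secure=False, no prefix)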
---
target_repo/backend/utils/email_utils.py
---
import base64
import email
import logging
import re
from typing import Dict, Any

from bs4 import BeautifulSoup
from email_validator import validate_email, EmailNotValidError

from constants import GENERIC_ATS_DOMAINS

logger = logging.getLogger(__name__)


def clean_whitespace(text: str) -> str:
    """
    remove \n, \r, and \t from strings
    """
    return text.replace("\n", "").replace("\r", "").replace("\t", "")


def is_automated_email(email: str) -> bool:
    """
    Determines if an email address is automated or from a person.

    Parameters:
        email (str): The email address to classify.

    Returns:
        bool: True if automated, False otherwise.
    """
    # Define patterns for common automated prefixes and domains
    automated_patterns = [
        r"^no[-_.]?reply@",  # Matches "no-reply", "no_reply", "noreply"
        r"^do[-_.]?not[-_.]?reply@",  # Matches "do-not-reply", "do_not_reply"
        r"^notifications@",  # Matches "notifications@"
        r"^team@",  # Matches "team@"
        r"^hello@",  # Matches "hello@" (often automated)
        r"@smartrecruiters\.com$",  # Matches specific automated domains
    ]

    # Check against the patterns
    for pattern in automated_patterns:
        if re.search(pattern, email, re.IGNORECASE):
            return True  # It's an automated email

    return False  # It's likely from a person


def is_valid_email(email: str) -> bool:
    try:
        validate_email(email)
        return True
    except EmailNotValidError as e:
        # email is not valid, exception message is human-readable
        print(str(e))
        return False


def get_email_content(email_data: Dict[str, Any]) -> str:
    """
    parses html content of email data and appends it to text content and subject content

    Note 1: linkedIn easy apply messages have *different* html and text_content, so we need to keep both
    Note 2: some automated emails only contain the information about the company in the subject and
    not the email body, so we need to append this to make sure the email processor gets to see it.

    """
    text_content = email_data["subject"]

    if email_data["text_content"]:
        text_content += "\n"
        text_content += email_data["text_content"]

    if email_data["html_content"]:
        soup = BeautifulSoup(email_data["html_content"], "html.parser")
        html_content = soup.get_text(separator=" ", strip=True)

        text_content += "\n"
        text_content += html_content

    return text_content


def get_email(message_id: str, gmail_instance=None):
    if gmail_instance:
        try:
            message = (
                gmail_instance.users()
                .messages()
                .get(userId="me", id=message_id, format="raw")
                .execute()
            )
            msg_str = base64.urlsafe_b64decode(message["raw"].encode("ASCII")).decode(
                "utf-8"
            )
            mime_msg = email.message_from_string(msg_str)
            # logger.info("mime_msg: %s", mime_msg)
            # logger.info("msg_str: %s", msg_str)
            email_data = {
                "id": message_id,
                "threadId": message.get("threadId", None),
                "from": None,
                "to": None,
                "subject": None,
                "date": None,
                "text_content": None,
                "html_content": None,
            }

            # Getting email headers
            email_data["from"] = clean_whitespace(mime_msg.get("From"))
            email_data["to"] = clean_whitespace(mime_msg.get("To"))
            email_data["subject"] = clean_whitespace(mime_msg.get("Subject"))
            email_data["date"] = mime_msg.get("Date")

            # Extract body of the email
            if mime_msg.is_multipart():
                for part in mime_msg.walk():
                    content_type = part.get_content_type()
                    content_disposition = str(part.get("Content-Disposition"))
                    if (
                        content_type == "text/plain"
                        and "attachment" not in content_disposition
                    ):
                        email_data["text_content"] = part.get_payload(
                            decode=True
                        ).decode(encoding="utf-8", errors="ignore")
                    elif (
                        content_type == "text/html"
                        and "attachment" not in content_disposition
                    ):
                        email_data["html_content"] = part.get_payload(
                            decode=True
                        ).decode(encoding="utf-8", errors="ignore")
            else:
                content_type = mime_msg.get_content_type()
                if content_type == "text/plain":
                    email_data["text_content"] = mime_msg.get_payload(
                        decode=True
                    ).decode(encoding="utf-8", errors="ignore")
                elif content_type == "text/html":
                    email_data["html_content"] = mime_msg.get_payload(
                        decode=True
                    ).decode(encoding="utf-8", errors="ignore")

            email_data["raw_text_content"] = email_data["text_content"]
            email_data["text_content"] = get_email_content(email_data)

            return email_data

        except Exception as e:
            logger.exception(f"Error retrieving email with id {message_id}: {e}")
            return {}
    return {}

def get_email_ids(query: tuple = None, gmail_instance=None):
    email_ids = []
    page_token = None

    while True:
        response = (
            gmail_instance.users()
            .messages()
            .list(
                userId="me",
                q=query,
                includeSpamTrash=True,
                pageToken=page_token,
            )
            .execute()
        )

        if "messages" in response:
            email_ids.extend(response["messages"])

        page_token = response.get("nextPageToken")
        if not page_token:
            break

    return email_ids


def get_email_payload(msg):
    return msg.get("payload", None)


def get_email_headers(msg):
    email_data = get_email_payload(msg)
    if email_data:
        return email_data.get("headers", None)
    return None


def get_email_parts(msg):
    email_data = get_email_payload(msg)
    if email_data:
        return email_data.get("parts", None)
    return None


def get_email_subject_line(msg):
    try:
        email_headers = get_email_headers(msg)
        if email_headers:
            for header in email_headers:
                key = header.get("name")
                if key == "Subject":
                    return header.get("value", "")
    except Exception as e:
        logger.error("Error getting email subject line: %s", e)
    return ""


def get_last_capitalized_words_in_line(line):
    try:
        words = line.split()
        last_capitalized_words = []
        for word in reversed(words):
            if word[0].isupper():
                last_capitalized_words.append(word)
            else:
                break
        return " ".join(reversed(last_capitalized_words))
    except Exception as e:
        logger.error("Error getting last capitalized words in email subject: %s", e)
        return ""


def get_email_from_address(msg):
    try:
        email_headers = get_email_headers(msg)
        if email_headers:
            for header in email_headers:
                if header.get("name") == "From":
                    # if value enclosed in <> then extract email address
                    # else return the value as is
                    from_address = header.get("value")
                    if "<" in from_address:
                        return from_address.split("<")[1].split(">")[0]
                    return from_address
    except Exception as e:
        logger.error("Error getting email from address: %s", e)
    return ""


def get_received_at_timestamp(message_id, msg):
    import datetime

    try:
        email_headers = get_email_headers(msg)
        if email_headers:
            for header in email_headers:
                key = header.get("name")
                if key == "Date":
                    return header.get("value")
    except Exception as e:
        print("msg_%s: %s" % (message_id, e))
    return datetime.datetime.now()  # default if trouble parsing


def is_generic_email_domain(domain):
    # input expects return value of get_email_domain_from_address
    return domain in GENERIC_ATS_DOMAINS


def get_email_domain_from_address(email_address):
    return email_address.split("@")[1] if "@" in email_address else ""

def clean_email(email_body: str) -> list:
    import spacy
    from spacy_cleaner import processing, Cleaner

    try:
        model = spacy.load("en_core_web_sm")
        pipeline = Cleaner(
            model,
            processing.remove_stopword_token,
            processing.remove_punctuation_token,
            processing.remove_number_token,
        )
        return pipeline.clean([email_body])
    except Exception as e:
        logger.error("Error cleaning email: %s", e)
        return []


def get_word_frequency(cleaned_email):
    try:
        word_dict = {}
        for word in cleaned_email[0].split(" "):
            if word not in word_dict:
                word_dict[word] = 1
            else:
                word_dict[word] += 1

        word_dict_sorted = sorted(
            word_dict.items(), key=lambda item: item[1], reverse=True
        )
        return word_dict_sorted
    except Exception as e:
        logger.error("Error getting word frequency: %s", e)
        return []


def get_top_word_in_email_body(msg_id, msg):
    try:
        parts = get_email_parts(msg)
        if parts:
            for part in parts:
                if part.get("mimeType") not in [
                    "text/plain",
                    "text/html",
                ]:
                    continue
                if part.get("mimeType") and part.get("mimeType") in [
                    "text/plain",
                    "text/html",
                ]:
                    data = base64.urlsafe_b64decode(
                        part.get("body", {}).get("data", {})
                    ).decode("utf-8")
                    # Parse the content with BeautifulSoup
                    soup = BeautifulSoup(data, "html.parser")
                    # Extract the plain text from the HTML content
                    email_text = soup.get_text()
                    cleaned_text = clean_email(email_text)

                    if cleaned_text:
                        word_frequency = get_word_frequency(cleaned_text)
                        top_capitalized_word = get_top_consecutive_capitalized_words(
                            word_frequency
                        )
                        if not top_capitalized_word:
                            if len(cleaned_text) > 0:
                                try:
                                    return cleaned_text[0][0]
                                except IndexError:
                                    return cleaned_text[0]
                        return top_capitalized_word
    except Exception as e:
        logger.error("Error getting top word: %s", e)
    return ""


def get_company_name(id, msg, subject_line):
    try:
        top_word = get_top_word_in_email_body(id, msg)
        from_address = get_email_from_address(msg)
        domain = get_email_domain_from_address(from_address)
        if not top_word or top_word[0].islower():
            # no top word, or top word is not capitalized
            if is_generic_email_domain(domain):
                # if generic ATS domain like workday, greenhouse, etc.,
                # check the last capitalized word(s) in the subject line
                return get_last_capitalized_words_in_line(subject_line) or ""
            return domain.split(".")[0]
        return top_word
    except Exception as e:
        logger.error("Error getting company name: %s", e)
        return ""


def get_top_consecutive_capitalized_words(tuples_list):
    """
    Helper function to parse company name from an email.
    We only want the top capitalized words that appear consecutively and with the same frequency.
    """
    try:
        result = []
        temp_group = []
        max = float("-inf")
        for i, (first, second) in enumerate(tuples_list):
            is_capitalized = first and first[0].isupper()

            if is_capitalized:
                if not temp_group:
                    max = second
                    temp_group.append((first, second))
                if temp_group and temp_group[-1][1] == second:
                    # Add to the current group if criteria match
                    temp_group.append((first, second))
                if second < max:
                    break
                result.append(first)
        return " ".join(result)
    except Exception as e:
        logger.error("Error getting top consecutive capitalized words: %s", e)
        return ""

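Company-name extraction falls through three sources: the highest-frequency consecutive capitalized words in the body, then the sender's domain, and finally the subject line when the domain belongs to a generic ATS. A sketch of the individual fallbacks on hypothetical inputs:

# Sketch only: the fallbacks used by get_company_name, on made-up inputs.
assert get_email_domain_from_address("[email protected]") == "acme.com"
assert is_generic_email_domain("hire.lever.co") is True
assert get_last_capitalized_words_in_line("Interview with CoolCompany") == "CoolCompany"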
---
target_repo/backend/utils/file_utils.py
---
def get_user_filepath(user_id: str) -> str:
    """
    Each user has their own directory to store their data.
    """
    return f"users/{user_id}"

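get_user_filepath is pure string interpolation with no sanitization of user_id, so a crafted ID can point outside the intended users/ directory; whether that is exploitable depends on the callers, which are not shown in this dump. A sketch:

# Sketch only: an unsanitized user_id reaches the returned path unchanged.
assert get_user_filepath("123") == "users/123"
assert get_user_filepath("../../etc/passwd") == "users/../../etc/passwd"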
---
target_repo/backend/utils/filter_utils.py
---
import yaml


def parse_simple(term: str, field: str, exclude: bool = False) -> str:
    """
    Parses a simple combination of search field and search term into a gmail search string.
    If exclude is true, a "-" character is prepended to the field.

    Args:
        term (str): the term to parse
        field (str): field to search
        exclude (bool): whether to exclude the term
    """
    if field == "body":
        field_str = ""
    else:
        field_str = f"{field}:"

    if exclude:
        out_str = f'-{field_str}"{term}"'
    else:
        out_str = f'{field_str}"{term}"'

    return out_str


def parse_wildcard(term: str, field: str, exclude: bool = False) -> str:
    """
    The wildcard * is convenient to use in a yaml file, but it is
    not supported by the Gmail API. This function will parse
    any number of wildcards as ({field}:"{term1}" AND {field}:"{term2}" AND ...)

    If exclude is true, a "-" character is prepended to the field.

    Args:
        term (str): the term to parse
        field (str): field to search
        exclude (bool): whether to exclude the term
    """
    if field == "body":
        field_str = ""
    else:
        field_str = f"{field}:"

    if exclude:
        sub_terms = term.split(" * ")
        out_str = "(" + " AND ".join([f'-{field_str}"{x}"' for x in sub_terms]) + ")"

    else:
        sub_terms = term.split(" * ")
        out_str = "(" + " AND ".join([f'{field_str}"{x}"' for x in sub_terms]) + ")"

    return out_str


1890def parse_base_filter_config(filter_path: str) -> str:
1891 with open(filter_path, "r") as fid:
1892 data = yaml.safe_load(fid)
1893
1894 filter_str = ""
1895 for block in data:
1896 sub_filter_str = ""
1897  # Map the block's logic onto a boolean operator. An unknown value raises
1898  # KeyError here rather than leaving `operator` unbound further down.
1899  operator = {"any": " OR ", "all": " AND "}[block["logic"]]
1901
1902 # parse each item based on schema logic
1903 simple_filters = []
1904 wildcard_any_filters = []
1905 if block["how"] == "include":
1906 simple_filters += [
1907 parse_simple(x, block["field"], exclude=False)
1908 for x in block["terms"]
1909 if "*" not in x
1910 ]
1911 wildcard_any_filters += [
1912 parse_wildcard(x, block["field"], exclude=False)
1913 for x in block["terms"]
1914 if "*" in x
1915 ]
1916 if block["how"] == "exclude":
1917 simple_filters += [
1918 parse_simple(x, block["field"], exclude=True) for x in block["terms"]
1919 ]
1920
1921 # join with appropriate operator
1922 if simple_filters + wildcard_any_filters:
1923 sub_filter_str = operator.join(simple_filters + wildcard_any_filters)
1924
1925  # if this isn't the first item then we need to add an extra operator in front
1926 if sub_filter_str:
1927 if len(filter_str) > 0:
1928 sub_filter_str = operator + sub_filter_str
1929 filter_str += sub_filter_str
1930
1931 filter_str = "(" + filter_str + ")"
1932
1933 return filter_str
1934
1935
1936def parse_override_filter_config(filter_path: str):
1937 """Parses an override filter config into a Gmail search string (not yet wired into the app)."""
1938 with open(filter_path, "r") as fid:
1939 data = yaml.safe_load(fid)
1940
1941 filter_str_list = []
1942 for block in data:
1943 simple_filters = []
1944 for sub_block in block:
1945 include_terms = sub_block["include_terms"]
1946 exclude_terms = sub_block["exclude_terms"]
1947
1948 # parse each item based on schema logic
1949 if include_terms is not None:
1950 simple_filters += [
1951 parse_simple(x, sub_block["field"], exclude=False)
1952 for x in sub_block["include_terms"]
1953 ]
1954 if exclude_terms is not None:
1955 simple_filters += [
1956 parse_simple(x, sub_block["field"], exclude=True)
1957 for x in sub_block["exclude_terms"]
1958 ]
1959
1960 # join with an AND operator
1961 if simple_filters:
1962 filter_str_list.append("(" + " AND ".join(simple_filters) + ")")
1963
1964 filter_str = "(" + " OR ".join(filter_str_list) + ")"
1965
1966 return filter_str
1967
1968
1969---
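To make the filter schema concrete, here is a hedged sketch of a config in the shape `parse_base_filter_config` expects, with hypothetical terms; run through `parse_simple`/`parse_wildcard` it composes into a single Gmail query string:

```python
import yaml

# A hypothetical base filter config: each block carries "logic" ("any"/"all"),
# "how" ("include"/"exclude"), "field", and "terms"; " * " inside a term
# acts as a wildcard that expands to AND-ed sub-terms.
config = """
- logic: any
  how: include
  field: subject
  terms:
    - thank you for applying
    - application * received
- logic: all
  how: exclude
  field: from
  terms:
    - newsletter
"""

blocks = yaml.safe_load(config)
print(blocks[0]["terms"])
# Fed through the parsers above, these blocks would combine into a query
# along the lines of:
#   (subject:"thank you for applying" OR (subject:"application" AND
#    subject:"received") AND -from:"newsletter")
```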
1970target_repo/backend/utils/llm_utils.py
1971---
1972import google.generativeai as genai
1973import time
1974import json
1975from google.generativeai.types import GenerateContentResponse
1976import logging
1977
1978from utils.config_utils import get_settings
1979
1980settings = get_settings()
1981
1982# Configure Google Gemini API
1983genai.configure(api_key=settings.GOOGLE_API_KEY)
1984model = genai.GenerativeModel("gemini-2.0-flash-lite")
1985logger = logging.getLogger(__name__)
1986logging.basicConfig(
1987 level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
1988)
1989
1993
1994def process_email(email_text):
1995 prompt = f"""
1996 Extract the company name, job application status, and job title (role) from the following email.
1997
1998 Given the content of an email related to job applications or recruitment, assign one of the following labels to job application status based on the main purpose or outcome of the message:
1999
2000 Application confirmation
2001 Rejection
2002 Availability request
2003 Information request
2004 Assessment sent
2005 Interview invitation
2006 Did not apply - inbound request
2007 Action required from company
2008 Hiring freeze notification
2009 Withdrew application
2010 Offer made
2011 False positive, not related to job search
2012 Informational outreach
2013
2014 Labeling Rules and Explanations for Job Application Status:
2015
2016 Application confirmation
2017 Assign this label if the email confirms receipt of a job application.
2018 Examples: "We have received your application", "Thank you for applying", "Your application has been submitted".
2019
2020 Rejection
2021 Use this label for emails explicitly stating that the candidate is not moving forward in the process.
2022 Examples: "We regret to inform you...", "We will not be proceeding with your application", "You have not been selected".
2023
2024 Availability request
2025 Assign this label if the company asks for your availability for a call, interview, or meeting.
2026 Examples: "Please let us know your availability", "When are you free for a call?", "Can you share your available times?"
2027
2028 Information request
2029 Use this label if the company requests additional information, documents, or clarification.
2030 Examples: "Please send your portfolio", "Can you provide references?", "We need more information about..."
2031
2032 Assessment sent
2033 Assign this label if the company sends a test, assignment, or assessment for you to complete as part of the hiring process.
2034 Examples: "Please complete the attached assessment", "Here is your coding challenge", "Take-home assignment enclosed".
2035
2036 Interview invitation
2037 Use this label if the company invites you to an interview (phone, video, or onsite).
2038 Examples: "We would like to invite you to interview", "Interview scheduled", "Please join us for an interview".
2039
2040 Did not apply - inbound request
2041 Assign this label if the company or recruiter reaches out to you first, and you did not apply for the position.
2042 Examples: "We found your profile and would like to connect", "Are you interested in this opportunity?", "We came across your resume".
2043
2044 Action required from company
2045 Use this label if the next step is pending from the company, and you are waiting for their response or action.
2046 Examples: "We will get back to you", "Awaiting feedback from the team", "We will contact you with next steps".
2047
2048 Hiring freeze notification
2049 Assign this label if the company notifies you that the position is on hold or canceled due to a hiring freeze.
2050 Examples: "Position is on hold", "Hiring freeze in effect", "We are pausing recruitment".
2051
2052 Withdrew application
2053 Use this label if you (the candidate) have withdrawn your application, or the email confirms your withdrawal.
2054 Examples: "You have withdrawn your application", "Thank you for letting us know you are no longer interested".
2055
2056 Offer made
2057 Assign this label if the company extends a job offer to you.
2058 Examples: "We are pleased to offer you the position", "Offer letter attached", "Congratulations, you have been selected".
2059
2060 False positive, not related to job search
2061 Use this label if the email is not related to job applications, recruitment, or hiring.
2062 Examples: Newsletters, spam, unrelated notifications, or personal emails.
2063
2064 Informational outreach
2065 Assign this label if the company or recruiter is reaching out to share information, updates, or opportunities, but not in direct response to an application or as an explicit invitation to apply.
2066 Examples: "We wanted to let you know about upcoming roles", "Here’s information about our company", "General outreach about our hiring process".
2067
2068 Provide the output in JSON format, for example: "company_name": "company_name", "job_application_status": "status", "job_title": "job_title"
2069 Remove backticks. Only use double quotes. Enclose key and value pairs in a single pair of curly braces.
2070 Email: {email_text}
2071 """
2072
2073 retries = 3 # Max retries
2074 delay = 60 # Initial delay
2075 for attempt in range(retries):
2076 try:
2077 logger.info("Calling generate_content")
2078   response: GenerateContentResponse = model.generate_content(prompt)
2079 response.resolve()
2080 response_json: str = response.text
2081 logger.info("Received response from model: %s", response_json)
2082 if response_json:
2083 cleaned_response_json = (
2084 response_json.replace("json", "")
2085 .replace("`", "")
2086 .replace("'", '"')
2087 .strip()
2088 )
2095 logger.info("Cleaned response: %s", cleaned_response_json)
2096 return json.loads(cleaned_response_json)
2097 else:
2098 logger.error("Empty response received from the model.")
2099 return None
2100 except Exception as e:
2101 if "429" in str(e):
2102 logger.warning(
2103 f"Rate limit hit. Retrying in {delay} seconds (attempt {attempt + 1})."
2104 )
2105 time.sleep(delay)
2106 else:
2107 logger.error(f"process_email exception: {e}")
2108 return None
2109 logger.error(f"Failed to process email after {retries} attempts.")
2110 return None
2111
2112
2113
2114---
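A minimal harness for `process_email`, assuming the module has been imported with a valid `GOOGLE_API_KEY`; the sample email text is illustrative, not a fixture from the repo:

```python
sample = (
    "Hi Jane, thank you for applying to Acme Robotics. "
    "We have received your application for the Software Engineer role."
)

result = process_email(sample)
if result is not None:
    # Expected shape, per the prompt above:
    # {"company_name": ..., "job_application_status": ..., "job_title": ...}
    print(result.get("company_name"), result.get("job_application_status"))
```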
2115target_repo/backend/db/companies.py
2116---
2117from sqlmodel import SQLModel, Field, UniqueConstraint
2118
2119
2120class Companies(SQLModel, table=True):
2121 __tablename__ = "companies"
2122 company_id: int = Field(default=None, primary_key=True)
2123 company_name: str
2124 company_email_domain: str
2125
2126 __table_args__ = (
2127 # Ensure that company_name and company_email_domain together are unique
2128 UniqueConstraint(
2129 "company_name",
2130 "company_email_domain",
2131 name="unique_company_name_and_domain",
2132 ),
2133 )
2134
2135
2136---
2137target_repo/backend/db/company_jobs.py
2138---
2139from sqlmodel import SQLModel, Field, UniqueConstraint
2140from datetime import datetime
2141
2142
2143class CompanyJobs(SQLModel, table=True):
2144 __tablename__ = "company_jobs"
2145 company_job_id: int = Field(default=None, primary_key=True)
2146 company_id: int = Field(foreign_key="companies.company_id", nullable=False)
2147 company_job_title_id: int | None = Field(
2148 default=None, foreign_key="job_titles.job_title_id", nullable=True
2149 )
2150 company_job_description: str | None = Field(default=None, nullable=True)
2151 company_job_posted_at: datetime = Field(
2152 default_factory=datetime.utcnow, nullable=False
2153 )
2154 company_job_location: str | None = Field(default=None, nullable=True)
2155
2156 __table_args__ = (
2157  # Ensure that a company cannot post the same job twice
2158  UniqueConstraint(
2159   "company_id",
2160   "company_job_title_id",
2161   "company_job_location",
2162   "company_job_posted_at",
2163   name="unique_job",
2164  ),
2165 )
2166
2167
2168---
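An illustrative sketch of how these two models are meant to be used together, assuming `engine` is the module-level engine from backend/database.py and the models above are importable; values are hypothetical:

```python
from datetime import datetime, timezone
from sqlmodel import Session

with Session(engine) as session:
    company = Companies(company_name="Acme", company_email_domain="acme.com")
    session.add(company)
    session.commit()
    session.refresh(company)  # populate the generated company_id

    job = CompanyJobs(
        company_id=company.company_id,
        company_job_posted_at=datetime.now(timezone.utc),
        company_job_location="Remote",
    )
    session.add(job)
    # a duplicate (company_id, title_id, location, posted_at) combination
    # would violate the unique_job constraint defined above
    session.commit()
```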
2169target_repo/backend/db/job_status.py
2170---
2171from sqlmodel import SQLModel, Field
2172
2173
2174class JobStatus(SQLModel, table=True):
2175 __tablename__ = "job_statuses"
2176 status_id: int = Field(default=None, primary_key=True)
2177 status_name: str
2178 status_description: str
2179
2180
2181---
2182target_repo/backend/db/job_titles.py
2183---
2184from sqlmodel import SQLModel, Field, UniqueConstraint
2185
2186
2187class JobTitles(SQLModel, table=True):
2188 __tablename__ = "job_titles"
2189 job_title_id: int = Field(default=None, primary_key=True)
2190 job_title: str
2191
2192 __table_args__ = (UniqueConstraint("job_title", name="unique_job_title"),)
2193
2194
2195---
2196target_repo/backend/db/processing_tasks.py
2197---
2198from sqlmodel import Field, SQLModel, Relationship
2199from datetime import datetime, timezone
2200import sqlalchemy as sa
2201from db.users import Users
2202
2203FINISHED = "finished"
2204STARTED = "started"
2205
2206
2207class TaskRuns(SQLModel, table=True):
2208 __tablename__ = "processing_task_runs"
2209 user_id: str = Field(foreign_key="users.user_id", primary_key=True)
2210 created: datetime = Field(default_factory=lambda: datetime.now(timezone.utc), nullable=False)
2211 updated: datetime = Field(
2212 sa_column_kwargs={"onupdate": sa.func.now()},
2213 default_factory=lambda: datetime.now(timezone.utc),
2214 nullable=False,
2215 )
2216 status: str = Field(nullable=False)
2217 total_emails: int = 0
2218 processed_emails: int = 0
2219
2220 user: Users = Relationship()
2221
2222
2223---
2224target_repo/backend/db/user_emails.py
2225---
2226from sqlmodel import SQLModel, Field
2227from datetime import datetime
2228
2229class UserEmails(SQLModel, table=True):
2230 __tablename__ = "user_emails"
2231 id: str = Field(primary_key=True) # Gmail email ID (not unique globally)
2232 user_id: str = Field(primary_key=True) # Unique per user (composite key)
2233 company_name: str
2234 application_status: str
2235 received_at: datetime
2236 subject: str
2237 job_title: str
2238 email_from: str # to avoid 'from' being a reserved key word
2239
2240---
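Because `(id, user_id)` form a composite primary key, the same Gmail message id may legitimately exist for two different users, and lookups must supply both parts (in the order the key columns are declared). A sketch with illustrative key values, assuming `engine` comes from backend/database.py:

```python
from sqlmodel import Session

with Session(engine) as session:
    record = session.get(UserEmails, ("18c2f0a9d3e4b5f6", "user-123"))
    if record is not None:
        print(record.company_name, record.application_status)
```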
2241target_repo/backend/db/user_job_status.py
2242---
2243from sqlmodel import SQLModel, Field
2244
2245
2246class UserJobStatuses(SQLModel, table=True):
2247 __tablename__ = "user_job_statuses"
2248 user_job_status_id: int = Field(default=None, primary_key=True)
2249 user_id: str = Field(foreign_key="users.user_id", nullable=False)
2250 job_id: int = Field(foreign_key="company_jobs.company_job_id", nullable=False)
2251 status_id: int = Field(foreign_key="job_statuses.status_id", nullable=False)
2252
2253
2254---
2255target_repo/backend/db/user_jobs.py
2256---
2257from sqlmodel import SQLModel, Field
2258from datetime import datetime
2259
2260
2261class UserJobs(SQLModel, table=True):
2262 __tablename__ = "user_jobs"
2263 user_job_id: int = Field(primary_key=True, nullable=False)
2264 user_id: str = Field(foreign_key="users.user_id", nullable=False)
2265 job_id: int = Field(foreign_key="company_jobs.company_job_id", nullable=False)
2266 applied_at: datetime
2267
2268
2269---
2270target_repo/backend/db/user_session.py
2271---
2272from sqlmodel import SQLModel, Field
2273from uuid import UUID, uuid4
2274from datetime import datetime, timezone
2275from typing import Optional
2276
2277class UserSession(SQLModel, table=True):
2278 __tablename__ = "user_session"
2279 id: UUID = Field(default_factory=uuid4, primary_key=True)
2280 user_id: str = Field(foreign_key="users.user_id")
2281 session_start: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
2282 session_end: Optional[datetime] = None
2283 user_agent: Optional[str] = None
2284
2285
2286---
2287target_repo/backend/db/users.py
2288---
2289from sqlmodel import SQLModel, Field
2290from pydantic import BaseModel
2291from datetime import datetime
2292
2293class UserData(BaseModel):
2294 user_id: str
2295 user_email: str
2296 start_date: datetime
2297
2298class Users(SQLModel, table=True):
2299 __tablename__ = "users"
2300 user_id: str = Field(default=None, primary_key=True)
2301 user_email: str = Field(nullable=False)
2302 start_date: datetime = Field(nullable=False) # Start date for job applications
2303
2304
2305---
2306target_repo/backend/db/utils/user_email_utils.py
2307---
2308from db.user_emails import UserEmails
2309from datetime import datetime, timezone
2310import email.utils
2311import logging
2312from database import engine
2313from sqlmodel import Session, select
2314
2315logger = logging.getLogger(__name__)
2316
2317def parse_email_date(date_str: str) -> datetime:
2318 """
2319 Converts an email date string into a Python datetime object
2320 """
2321 try:
2322  dt = email.utils.parsedate_to_datetime(date_str)
2323 except (TypeError, ValueError):
2324  # unparseable date string; default to the current UTC datetime
2325  dt = datetime.now(timezone.utc)
 return dt
2326
2327
2328def check_email_exists(user_id: str, email_id: str) -> bool:
2329 """
2330 Checks if an email with the given emailId and userId exists in the database.
2331 """
2332 with Session(engine) as session:
2333 statement = select(UserEmails).where(
2334 (UserEmails.user_id == user_id) & (UserEmails.id == email_id)
2335 )
2336 result = session.exec(statement).first()
2337 return result is not None
2338
2339
2340def create_user_email(user, message_data: dict) -> UserEmails:
2341 """
2342 Creates a UserEmail record instance from the provided data.
2343 """
2344 try:
2345 received_at_str = message_data["received_at"]
2346 received_at = parse_email_date(received_at_str) # parse_email_date function was created as different date formats were being pulled from the data
2347 if check_email_exists(user.user_id, message_data["id"]):
2348 logger.info(f"Email with ID {message_data['id']} already exists in the database.")
2349 return None
2350 return UserEmails(
2351 id=message_data["id"],
2352 user_id=user.user_id,
2353 company_name=message_data["company_name"],
2354 application_status=message_data["application_status"],
2355 received_at=received_at,
2356 subject=message_data["subject"],
2357 job_title=message_data["job_title"],
2358 email_from=message_data["from"]
2359 )
2360 except Exception as e:
2361 logger.error(f"Error creating UserEmail record: {e}")
2362 return None
2363
2364
2365---
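A short illustration of `parse_email_date` on the RFC 2822 dates Gmail returns, and its fallback for unparseable strings; the inputs are illustrative:

```python
print(parse_email_date("Thu, 20 Mar 2025 14:32:00 +0000"))
# -> 2025-03-20 14:32:00+00:00
print(parse_email_date("not a date"))  # -> current UTC datetime
```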
2366target_repo/backend/db/utils/user_utils.py
2367---
2368import logging
2369from typing import Optional, Tuple
2370from db.user_emails import UserEmails
2371from sqlmodel import Session, select, func
2372from db.users import Users
2373from datetime import datetime, timedelta, timezone
2374
2375logger = logging.getLogger(__name__)
2376
2377def get_last_email_date(user_id: str) -> Optional[datetime]:
2378 """
2379 Checks the date of the user's most recent email
2380 """
2381 from database import engine
2382
2383 with Session(engine) as session:
2384 row = session.exec(
2385 select(func.max(UserEmails.received_at))
2386 .where(UserEmails.user_id == user_id)
2387 ).one() # aggregates in SQL to a single row
2388 return row
2389
2390def user_exists(user) -> Tuple[bool, Optional[datetime]]:
2391 """
2392 Checks if the user is already in the database
2393 """
2394 from database import engine
2395
2396 with Session(engine) as session:
2397 existing_user = session.exec(select(Users).where(Users.user_id == user.user_id)).first()
2398 if not existing_user:
2399 return False, None
2400 else:
2401 last_fetched_date = get_last_email_date(user.user_id)
2402 return True, last_fetched_date
2403
2404def add_user(user, request, start_date=None) -> Users:
2405 """
2406 Writes user data to the users model and session storage
2407
2408 """
2409 from database import engine
2410 with Session(engine) as session:
2411 # Check if the user already exists in the database
2412 existing_user = session.exec(select(Users).where(Users.user_id == user.user_id)).first()
2413
2414 if not existing_user:
2415
2416   start_date = start_date or getattr(user, "start_date", None) or (datetime.now(timezone.utc) - timedelta(days=90))
2417
2418 if isinstance(start_date, datetime):
2419 start_date = start_date.strftime("%Y-%m-%d")
2420
2421 # add a new user record
2422 new_user = Users(
2423 user_id=user.user_id,
2424 user_email=user.user_email,
2425 start_date=start_date
2426 )
2427
2428 session.add(new_user)
2429 session.commit()
2430 session.refresh(new_user)
2431 logger.info(f"Created new user record for user_id: {user.user_id}")
2432
2433 # Write start date to session storage
2434 if isinstance(start_date, str):
2435 request.session["start_date"] = start_date # Already a string, no need to convert
2436 else:
2437 request.session["start_date"] = start_date.isoformat() # Convert only if it's a datetime object
2438
2439 return new_user
2440 else:
2441 logger.info(f"User {user.user_id} already exists in the database.")
2442 return existing_user
2443
2444---
2445target_repo/backend/alembic/env.py
2446---
2447from logging.config import fileConfig
2448
2449from sqlalchemy import engine_from_config
2450from sqlalchemy import pool
2451
2452from alembic import context
2453
2454# Import your SQLAlchemy models/metadata
2455import sys
2456import os
2457sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
2458
2459from sqlmodel import SQLModel
2460
2461# This is the Alembic Config object
2462config = context.config
2463
2464# Interpret the config file for Python logging
2465if config.config_file_name is not None:
2466 fileConfig(config.config_file_name)
2467
2468# Set target metadata to SQLModel's metadata since that's likely what your models use
2469target_metadata = SQLModel.metadata
2470
2471# other values from the config, defined by the needs of env.py,
2472# can be acquired:
2473# my_important_option = config.get_main_option("my_important_option")
2474# ... etc.
2475
2476
2477def run_migrations_offline() -> None:
2478 """Run migrations in 'offline' mode."""
2479 url = config.get_main_option("sqlalchemy.url")
2480 context.configure(
2481 url=url,
2482 target_metadata=target_metadata,
2483 literal_binds=True,
2484 dialect_opts={"paramstyle": "named"},
2485 )
2486
2487 with context.begin_transaction():
2488 context.run_migrations()
2489
2490
2491def run_migrations_online() -> None:
2492 """Run migrations in 'online' mode."""
2493 connectable = engine_from_config(
2494 config.get_section(config.config_ini_section),
2495 prefix="sqlalchemy.",
2496 poolclass=pool.NullPool,
2497 )
2498
2499 with connectable.connect() as connection:
2500 context.configure(
2501 connection=connection, target_metadata=target_metadata
2502 )
2503
2504 with context.begin_transaction():
2505 context.run_migrations()
2506
2507
2508if context.is_offline_mode():
2509 run_migrations_offline()
2510else:
2511 run_migrations_online()
2512
2513
2514---
2515target_repo/backend/alembic/versions/6240656d52f6_add_job_title_column.py
2516---
2517"""add_job_title_column
2518
2519Revision ID: 6240656d52f6
2520Revises: b240c664ed46
2521Create Date: 2025-03-16 21:31:17.486275
2522
2523"""
2524from typing import Sequence, Union
2525
2526from alembic import op
2527import sqlalchemy as sa
2528
2529
2530# revision identifiers, used by Alembic.
2531revision: str = '6240656d52f6'
2532down_revision: Union[str, None] = 'b240c664ed46'
2533branch_labels: Union[str, Sequence[str], None] = None
2534depends_on: Union[str, Sequence[str], None] = None
2535
2536
2537def upgrade() -> None:
2538 """Add job_title column to the relevant table."""
2539 op.add_column('user_email', sa.Column('job_title', sa.String(255), nullable=True))
2540
2541
2542
2543def downgrade() -> None:
2544 """Remove job_title column."""
2545 op.drop_column('user_email', 'job_title')
2546
2547---
2548target_repo/backend/alembic/versions/b240c664ed46_change_user_email_id_to_varchar.py
2549---
2550"""change_user_email_id_to_varchar
2551
2552Revision ID: b240c664ed46
2553Revises:
2554Create Date: 2025-03-16 02:58:30.325992
2555
2556"""
2557from typing import Sequence, Union
2558
2559from alembic import op
2560import sqlalchemy as sa
2561from sqlalchemy.dialects import postgresql
2562
2563# revision identifiers, used by Alembic.
2564revision: str = 'b240c664ed46'
2565down_revision: Union[str, None] = None
2566branch_labels: Union[str, Sequence[str], None] = None
2567depends_on: Union[str, Sequence[str], None] = None
2568
2569
2570def upgrade() -> None:
2571 """Change user_email.id column from integer to varchar and create composite primary key."""
2572 # First, drop any constraints that depend on the id column
2573 op.execute('ALTER TABLE user_email DROP CONSTRAINT IF EXISTS user_email_pkey')
2574
2575 # Change the column type
2576 op.alter_column('user_email', 'id',
2577 existing_type=sa.INTEGER(),
2578 type_=sa.VARCHAR(255),
2579 postgresql_using='id::varchar')
2580
2581 # Add composite primary key constraint
2582 op.execute('ALTER TABLE user_email ADD PRIMARY KEY (id, user_id)')
2583
2584
2585def downgrade() -> None:
2586 """Revert to integer id column with appropriate primary key."""
2587 # Drop the composite primary key
2588 op.execute('ALTER TABLE user_email DROP CONSTRAINT IF EXISTS user_email_pkey')
2589
2590 # Change id back to integer (with potential data loss warning if non-numeric ids exist)
2591 op.alter_column('user_email', 'id',
2592 existing_type=sa.VARCHAR(255),
2593 type_=sa.INTEGER(),
2594 postgresql_using='id::integer')
2595
2596 # Restore original primary key on id only
2597 op.execute('ALTER TABLE user_email ADD PRIMARY KEY (id)')
2598
2599
2600---
2601target_repo/backend/alembic/versions/c256d0279ea6_rename_user_email_table_to_plural.py
2602---
2603"""rename_user_email_table_to_plural
2604
2605Revision ID: c256d0279ea6
2606Revises: 6240656d52f6
2607Create Date: 2025-03-17 03:16:53.078420
2608
2609"""
2610from typing import Sequence, Union
2611
2612from alembic import op
2613import sqlalchemy as sa
2614
2615
2616# revision identifiers, used by Alembic.
2617revision: str = 'c256d0279ea6'
2618down_revision: Union[str, None] = '6240656d52f6'
2619branch_labels: Union[str, Sequence[str], None] = None
2620depends_on: Union[str, Sequence[str], None] = None
2621
2622
2623def upgrade() -> None:
2624 """Rename user_email table to user_emails."""
2625 op.rename_table('user_email', 'user_emails')
2626
2627
2628def downgrade() -> None:
2629 """Rename user_emails table back to user_email."""
2630 op.rename_table('user_emails', 'user_email')
2631
2632---
2633target_repo/backend/routes/auth_routes.py
2634---
2635import datetime
2636import logging
2637from fastapi import APIRouter, Depends, HTTPException, Request, BackgroundTasks
2638from fastapi.responses import RedirectResponse, HTMLResponse
2639from google_auth_oauthlib.flow import Flow
from google.auth.transport.requests import Request as GoogleAuthRequest
2640
2641from db.utils.user_utils import user_exists
2642from utils.auth_utils import AuthenticatedUser
2643from session.session_layer import create_random_session_string, validate_session
2644from utils.config_utils import get_settings
2645from utils.cookie_utils import set_conditional_cookie
2646from routes.email_routes import fetch_emails_to_db
2647from slowapi import Limiter
2648from slowapi.util import get_remote_address
2649
2650limiter = Limiter(key_func=get_remote_address)
2651
2652# Logger setup
2653logger = logging.getLogger(__name__)
2654
2655# Get settings
2656settings = get_settings()
2657
2658# FastAPI router for Google login
2659router = APIRouter()
2660
2661APP_URL = settings.APP_URL
2662
2663@router.get("/login")
2664@limiter.limit("10/minute")
2665async def login(request: Request, background_tasks: BackgroundTasks):
2666 """Handles Google OAuth2 login and authorization code exchange."""
2667 code = request.query_params.get("code")
2668 flow = Flow.from_client_secrets_file(
2669 settings.CLIENT_SECRETS_FILE,
2670 settings.GOOGLE_SCOPES,
2671 redirect_uri=settings.REDIRECT_URI,
2672 )
2673
2674 try:
2675 if not code:
2676 authorization_url, state = flow.authorization_url(prompt="consent")
2677 return RedirectResponse(url=authorization_url)
2678 logger.info("Authorization code received, exchanging for token...")
2679 try:
2680 flow.fetch_token(code=code)
2681 except Exception as e:
2682 logger.error("Failed to fetch token: %s", e)
2683 return RedirectResponse(
2684 url=f"{settings.APP_URL}/errors?message=permissions_error",
2685 status_code=303
2686 )
2687 try:
2688 creds = flow.credentials
2689 except Exception as e:
2690 logger.error("Failed to fetch credentials: %s", e)
2691 return RedirectResponse(
2692 url=f"{settings.APP_URL}/errors?message=credentials_error",
2693 status_code=303
2694 )
2695
2696  if not creds.valid:
2697   # refreshing requires google-auth's transport Request, not fastapi.Request
   creds.refresh(GoogleAuthRequest())
2698   return RedirectResponse("/login", status_code=303)
2699
2700 user = AuthenticatedUser(creds)
2701 session_id = request.session["session_id"] = create_random_session_string()
2702
2703 # Set session details
2704 try:
2705 token_expiry = creds.expiry.isoformat()
2706 except Exception as e:
2707 logger.error("Failed to parse token expiry: %s", e)
2708 token_expiry = (
2709 datetime.datetime.utcnow() + datetime.timedelta(hours=1)
2710 ).isoformat()
2711
2712 request.session["token_expiry"] = token_expiry
2713 request.session["user_id"] = user.user_id
2714 request.session["creds"] = creds.to_json()
2715 request.session["access_token"] = creds.token
2716
2717 # NOTE: change redirection once dashboard is completed
2718 exists, last_fetched_date = user_exists(user)
2719 if exists:
2720 logger.info("User already exists in the database.")
2721 response = RedirectResponse(
2722 url=f"{settings.APP_URL}/processing", status_code=303
2723 )
2724 background_tasks.add_task(fetch_emails_to_db, user, request, last_fetched_date, user_id=user.user_id)
2725 logger.info("Background task started for user_id: %s", user.user_id)
2726 else:
2727 request.session["is_new_user"] = True
2728 response = RedirectResponse(
2729 url=f"{settings.APP_URL}/dashboard", status_code=303
2730 )
2731   logger.info("User does not exist in the database")
2732
2733 response = set_conditional_cookie(
2734 key="Authorization", value=session_id, response=response
2735 )
2736
2737 return response
2738 except Exception as e:
2739 logger.error("Login error: %s", e)
2740 return HTMLResponse(content="An error occurred, sorry!", status_code=500)
2741
2742
2743@router.get("/logout")
2744async def logout(request: Request):
2745 logger.info("Logging out")
2746 request.session.clear()
2747 # delete cookies on the response that is actually returned; cookies set on
 # an injected response parameter are dropped when a new response is returned
2748 response = RedirectResponse(f"{APP_URL}", status_code=303)
2749 response.delete_cookie(key="__Secure-Authorization")
 response.delete_cookie(key="Authorization")
 return response
2750
2751
2752@router.get("/me")
2753async def getUser(request: Request, user_id: str = Depends(validate_session)):
2754 if not user_id:
2755 raise HTTPException(
2756 status_code=401, detail="No user id found in session"
2757 )
2758 return {"user_id": user_id}
2759
2760---
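A hedged sketch of the credential-refresh path used in /login. google-auth needs its own transport `Request` object to refresh (fastapi.Request will not work); `ensure_fresh` is a hypothetical helper name, not part of the repo:

```python
from google.auth.transport.requests import Request as GoogleAuthRequest
from google.oauth2.credentials import Credentials

def ensure_fresh(creds: Credentials) -> Credentials:
    # Refresh only when the token is invalid and a refresh token is present.
    if not creds.valid and creds.refresh_token:
        creds.refresh(GoogleAuthRequest())
    return creds
```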
2761target_repo/backend/routes/email_routes.py
2762---
2763import logging
2764from typing import List, Optional
2765from fastapi import APIRouter, Depends, Request, HTTPException, BackgroundTasks
2766from fastapi.responses import HTMLResponse, JSONResponse, RedirectResponse
2767from sqlmodel import Session, select, desc
2768from googleapiclient.discovery import build
2769from db.user_emails import UserEmails
2770from db import processing_tasks as task_models
2771from db.utils.user_email_utils import create_user_email
2772from utils.auth_utils import AuthenticatedUser
2773from utils.email_utils import get_email_ids, get_email
2774from utils.llm_utils import process_email
2775from utils.config_utils import get_settings
2776from session.session_layer import validate_session
2777import database
2778from google.oauth2.credentials import Credentials
2779import json
2780from start_date.storage import get_start_date_email_filter
2781from constants import QUERY_APPLIED_EMAIL_FILTER
2782from datetime import datetime, timedelta, timezone
2783from slowapi import Limiter
2784from slowapi.util import get_remote_address
2785
2786limiter = Limiter(key_func=get_remote_address)
2787
2788# Logger setup
2789logger = logging.getLogger(__name__)
2790
2791# Get settings
2792settings = get_settings()
2793APP_URL = settings.APP_URL
2794
2795SECONDS_BETWEEN_FETCHING_EMAILS = 1 * 60 * 60 # 1 hour
2796
2797# FastAPI router for email routes
2798router = APIRouter()
2799
2800@router.get("/processing", response_class=HTMLResponse)
2801async def processing(request: Request, db_session: database.DBSession, user_id: str = Depends(validate_session)):
2802 logging.info("user_id:%s processing", user_id)
2803 if not user_id:
2804 logger.info("user_id: not found, redirecting to login")
2805 return RedirectResponse("/logout", status_code=303)
2806
2807 process_task_run: task_models.TaskRuns = db_session.get(task_models.TaskRuns, user_id)
2808
2809 if process_task_run is None:
2810 raise HTTPException(
2811 status_code=404, detail="Processing has not started."
2812 )
2813
2814 if process_task_run.status == task_models.FINISHED:
2815 logger.info("user_id: %s processing complete", user_id)
2816 return JSONResponse(
2817 content={
2818 "message": "Processing complete",
2819 "processed_emails": process_task_run.processed_emails,
2820 "total_emails": process_task_run.total_emails,
2821 }
2822 )
2823 else:
2824  logger.info("user_id: %s processing not complete", user_id)
2825 return JSONResponse(
2826 content={
2827 "message": "Processing in progress",
2828 "processed_emails": process_task_run.processed_emails,
2829 "total_emails": process_task_run.total_emails,
2830 }
2831 )
2832
2833
2834@router.get("/get-emails", response_model=List[UserEmails])
2835@limiter.limit("5/minute")
2836def query_emails(request: Request, db_session: database.DBSession, user_id: str = Depends(validate_session)) -> List[UserEmails]:
2837 try:
2838 logger.info(f"Fetching emails for user_id: {user_id}")
2839
2840 # Query emails sorted by date (newest first)
2841 statement = select(UserEmails).where(UserEmails.user_id == user_id).order_by(desc(UserEmails.received_at))
2842 user_emails = db_session.exec(statement).all()
2843
2844 logger.info(f"Found {len(user_emails)} emails for user_id: {user_id}")
2845 return user_emails # Return empty list if no emails exist
2846
2847 except Exception as e:
2848 logger.error(f"Error fetching emails for user_id {user_id}: {e}")
2849 raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
2850
2851
2852@router.delete("/delete-email/{email_id}")
2853async def delete_email(request: Request, db_session: database.DBSession, email_id: str, user_id: str = Depends(validate_session)):
2854 """
2855 Delete an email record by its ID for the authenticated user.
2856 """
2857 try:
2858 # Query the email record to ensure it exists and belongs to the user
2859 email_record = db_session.exec(
2860 select(UserEmails).where(
2861 (UserEmails.id == email_id) & (UserEmails.user_id == user_id)
2862 )
2863 ).first()
2864
2865 if not email_record:
2866 logger.warning(f"Email with id {email_id} not found for user_id {user_id}")
2867 raise HTTPException(
2868 status_code=404, detail=f"Email with id {email_id} not found"
2869 )
2870
2871 # Delete the email record
2872 db_session.delete(email_record)
2873 db_session.flush()
2874
2875 logger.info(f"Email with id {email_id} deleted successfully for user_id {user_id}")
2876 return {"message": "Item deleted successfully"}
2877
2878 except Exception as e:
2879 logger.error(f"Error deleting email with id {email_id} for user_id {user_id}: {e}")
2880 raise HTTPException(
2881 status_code=500, detail=f"Failed to delete email: {str(e)}"
2882 )
2883
2884
2885@router.post("/fetch-emails")
2886@limiter.limit("5/minute")
2887async def start_fetch_emails(
2888 request: Request, background_tasks: BackgroundTasks, user_id: str = Depends(validate_session)
2889):
2890 """Starts the background task for fetching and processing emails."""
2891
2892 if not user_id:
2893 raise HTTPException(status_code=403, detail="Unauthorized")
2894 logger.info(f"user_id:{user_id} start_fetch_emails")
2895 # Retrieve stored credentials
2896 creds_json = request.session.get("creds")
2897 if not creds_json:
2898 logger.error(f"Missing credentials for user_id: {user_id}")
2899 return HTMLResponse(content="User not authenticated. Please log in again.", status_code=401)
2900
2901 try:
2902 # Convert JSON string back to Credentials object
2903 creds_dict = json.loads(creds_json)
2904 creds = Credentials.from_authorized_user_info(creds_dict) # Convert dict to Credentials
2905 user = AuthenticatedUser(creds)
2906
2907 logger.info(f"Starting email fetching process for user_id: {user_id}")
2908
2909 # Start email fetching in the background
2910 background_tasks.add_task(fetch_emails_to_db, user, request, user_id=user_id)
2911
2912 return JSONResponse(content={"message": "Email fetching started"}, status_code=200)
2913 except Exception as e:
2914 logger.error(f"Error reconstructing credentials: {e}")
2915 raise HTTPException(status_code=500, detail="Failed to authenticate user")
2916
2917
2918def fetch_emails_to_db(user: AuthenticatedUser, request: Request, last_updated: Optional[datetime] = None, *, user_id: str) -> None:
2919 logger.info(f"Fetching emails to db for user_id: {user_id}")
2920
2921 with Session(database.engine) as db_session:
2922 # we track starting and finishing fetching of emails for each user
2923 process_task_run = (
2924 db_session.query(task_models.TaskRuns).filter_by(user_id=user_id).one_or_none()
2925 )
2926 if process_task_run is None:
2927 # if this is the first time running the task for the user, create a record
2928 process_task_run = task_models.TaskRuns(user_id=user_id)
2929 db_session.add(process_task_run)
2930  elif datetime.now(timezone.utc) - process_task_run.updated.replace(
2931   tzinfo=timezone.utc  # stored timestamps are treated as naive UTC
2932  ) < timedelta(seconds=SECONDS_BETWEEN_FETCHING_EMAILS):
2933 # limit how frequently emails can be fetched by a specific user
2934 logger.warning(
2935 "Less than an hour since last fetch of emails for user",
2936 extra={"user_id": user_id},
2937 )
2938 return
2939
2940 # this is helpful if the user applies for a new job and wants to rerun the analysis during the same session
2941 process_task_run.processed_emails = 0
2942 process_task_run.total_emails = 0
2943 process_task_run.status = task_models.STARTED
2944
2945 db_session.commit() # sync with the database so calls in the future reflect the task is already started
2946
2947 start_date = request.session.get("start_date")
2948 logger.info(f"start_date: {start_date}")
2949 start_date_query = get_start_date_email_filter(start_date)
2950 is_new_user = request.session.get("is_new_user")
2951
2952 query = start_date_query
2953 # check for users last updated email
2954 if last_updated:
2955   # convert the datetime to a Unix timestamp in seconds
2956   additional_time = int(last_updated.timestamp())
2957   # append it to the query so we only fetch emails received after that moment;
2958   # for example, if the newest stored email was received at 2025-03-20 14:32 UTC,
2959   # this becomes "after:1742481120" and Gmail returns only newer messages
2960 if not start_date or not is_new_user:
2961 query = QUERY_APPLIED_EMAIL_FILTER
2962 query += f" after:{additional_time}"
2963
2964 logger.info(f"user_id:{user_id} Fetching emails after {last_updated.isoformat()}")
2965  else:
2966   logger.info(f"user_id:{user_id} Fetching all emails (no last fetched date; start date may apply)")
2970
2971 service = build("gmail", "v1", credentials=user.creds)
2972
2973 messages = get_email_ids(
2974 query=query, gmail_instance=service
2975 )
2976 # Update session to remove "new user" status
2977 request.session["is_new_user"] = False
2978
2979 if not messages:
2980 logger.info(f"user_id:{user_id} No job application emails found.")
2981 process_task_run = db_session.get(task_models.TaskRuns, user_id)
2982 process_task_run.status = task_models.FINISHED
2983 db_session.commit()
2984 return
2985
2986 logger.info(f"user_id:{user.user_id} Found {len(messages)} emails.")
2987 process_task_run.total_emails = len(messages)
2988 db_session.commit()
2989
2990 email_records = [] # list to collect email records
2991
2992 for idx, message in enumerate(messages):
2993 message_data = {}
2994 # (email_subject, email_from, email_domain, company_name, email_dt)
2995 msg_id = message["id"]
2996 logger.info(
2997 f"user_id:{user_id} begin processing for email {idx + 1} of {len(messages)} with id {msg_id}"
2998 )
2999 process_task_run.processed_emails = idx + 1
3000 db_session.commit()
3001
3002 msg = get_email(message_id=msg_id, gmail_instance=service)
3003
3004   if msg:
    result = None  # ensure `result` is defined even if process_email raises
3005    try:
3006 result = process_email(msg["text_content"])
3007 # if values are empty strings or null, set them to "unknown"
3008 for key in result.keys():
3009 if not result[key]:
3010 result[key] = "unknown"
3011 except Exception as e:
3012 logger.error(
3013 f"user_id:{user_id} Error processing email {idx + 1} of {len(messages)} with id {msg_id}: {e}"
3014 )
3015
3016 if not isinstance(result, str) and result:
3017 logger.info(
3018 f"user_id:{user_id} successfully extracted email {idx + 1} of {len(messages)} with id {msg_id}"
3019 )
3020     if result.get("job_application_status", "").lower() == "false positive, not related to job search":
3021 logger.info(
3022 f"user_id:{user_id} email {idx + 1} of {len(messages)} with id {msg_id} is a false positive, not related to job search"
3023 )
3024 continue # skip this email if it's a false positive
3025 else: # processing returned unknown which is also likely false positive
3026 logger.warning(
3027 f"user_id:{user_id} failed to extract email {idx + 1} of {len(messages)} with id {msg_id}"
3028 )
3029 result = {"company_name": "unknown", "application_status": "unknown", "job_title": "unknown"}
3030
3031 message_data = {
3032 "id": msg_id,
3033 "company_name": result.get("company_name", "unknown"),
3034 "application_status": result.get("job_application_status", "unknown"),
3035 "received_at": msg.get("date", "unknown"),
3036 "subject": msg.get("subject", "unknown"),
3037 "job_title": result.get("job_title", "unknown"),
3038 "from": msg.get("from", "unknown"),
3039 }
3040 email_record = create_user_email(user, message_data)
3041 if email_record:
3042 email_records.append(email_record)
3043
3044 # batch insert all records at once
3045 if email_records:
3046 db_session.add_all(email_records)
3047 logger.info(
3048 f"Added {len(email_records)} email records for user {user_id}"
3049 )
3050
3051 process_task_run.status = task_models.FINISHED
3052 db_session.commit()
3053
3054 logger.info(f"user_id:{user_id} Email fetching complete.")
3055
3056
3057---
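How the incremental fetch narrows the Gmail query: the `after:` operator accepts a Unix timestamp in seconds. A small runnable illustration; the base filter string here is a stand-in for QUERY_APPLIED_EMAIL_FILTER from constants.py:

```python
from datetime import datetime, timezone

last_updated = datetime(2025, 3, 20, 14, 32, tzinfo=timezone.utc)
query = 'subject:("application received")' + f" after:{int(last_updated.timestamp())}"
print(query)  # -> subject:("application received") after:1742481120
```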
3058target_repo/backend/routes/file_routes.py
3059---
3060import csv
3061import os
3062import logging
3063import plotly.graph_objects as go
3064from fastapi import APIRouter, HTTPException, Request, Depends
3065from fastapi.responses import FileResponse, RedirectResponse
3066from slowapi import Limiter
3067from slowapi.util import get_remote_address
3068import database
3069from utils.file_utils import get_user_filepath
3070from session.session_layer import validate_session
3071from routes.email_routes import query_emails
3072
3073
3074# Logger setup
3075logger = logging.getLogger(__name__)
3076
3077# FastAPI router for file routes
3078router = APIRouter()
3079limiter = Limiter(key_func=get_remote_address)
3080
3081@router.get("/download-file")
3082async def download_file(request: Request, user_id: str = Depends(validate_session)):
3083 if not user_id:
3084 return RedirectResponse("/logout", status_code=303)
3085 directory = get_user_filepath(user_id)
3086 filename = "emails.csv"
3087 filepath = f"{directory}/{filename}"
3088 if os.path.exists(filepath):
3089 logger.info("user_id:%s downloading from filepath %s", user_id, filepath)
3090 return FileResponse(filepath)
3091 raise HTTPException(status_code=400, detail="File not found")
3092
3093
3094@router.get("/write-to-csv")
3095async def write_to_csv(request: Request, db_session: database.DBSession, user_id: str = Depends(validate_session)):
3096 if not user_id:
3097 return RedirectResponse("/logout", status_code=303)
3098
3099 # Get job related email data from DB
3100 emails = query_emails(request, db_session=db_session, user_id=user_id)
3101 if not emails:
3102 raise HTTPException(status_code=400, detail="No data found to write")
3103
3104 directory = get_user_filepath(user_id)
3105 os.makedirs(directory, exist_ok=True) # Ensure the directory exists
3106
3107 filename = "emails.csv"
3108 filepath = os.path.join(directory, filename)
3109
3110 # Key: DB field name; Value: Human-readable field name
3111 field_mapping = {
3112 "company_name": "Company Name",
3113 "application_status": "Application Status",
3114 "received_at": "Received At",
3115 "subject": "Subject",
3116 "email_from": "Sender"
3117 }
3118
3119 selected_fields = list(field_mapping.keys())
3120 headers = list(field_mapping.values())
3121
3122 # Filter out unwanted fields
3123 processed_emails = [
3124 {key: value for key, value in email if key in selected_fields} for email in emails
3125 ]
3126
3127 # Write to CSV
3128 with open(filepath, mode="w", newline="") as file:
3129 writer = csv.writer(file)
3130 writer.writerow(headers)
3131 for row in processed_emails:
3132 writer.writerow([row[field] for field in selected_fields])
3133
3134 logger.info("CSV file created at %s", filepath)
3135 return {"message": f"CSV file written successfully at {filepath}"}
3136
3137
3138# Write and download csv
3139@router.get("/process-csv")
3140@limiter.limit("2/minute")
3141async def process_csv(request: Request, db_session: database.DBSession, user_id: str = Depends(validate_session)):
3142 if not user_id:
3143 return RedirectResponse("/logout", status_code=303)
3144
3145 directory = get_user_filepath(user_id)
3146 filename = "emails.csv"
3147 filepath = os.path.join(directory, filename)
3148
3149 # Get job related email data from DB
3150 emails = query_emails(request, db_session=db_session, user_id=user_id)
3151 if not emails:
3152 raise HTTPException(status_code=400, detail="No data found to write")
3153 # Ensure the directory exists
3154 os.makedirs(directory, exist_ok=True)
3155
3156 # Key: DB field name; Value: Human-readable field name
3157 field_mapping = {
3158 "company_name": "Company Name",
3159 "application_status": "Application Status",
3160 "received_at": "Received At",
3161 "job_title": "Job Title",
3162 "subject": "Subject",
3163 "email_from": "Sender"
3164 }
3165
3166 selected_fields = list(field_mapping.keys())
3167 headers = list(field_mapping.values())
3168
3169 # Filter out unwanted fields
3170 processed_emails = [
3171 {key: value for key, value in email if key in selected_fields} for email in emails
3172 ]
3173
3174 # Write to CSV
3175 with open(filepath, mode="w", newline="") as file:
3176 writer = csv.writer(file)
3177 writer.writerow(headers)
3178 for row in processed_emails:
3179 writer.writerow([row[field] for field in selected_fields])
3180
3181 logger.info("CSV file created at %s", filepath)
3182
3183 # Download CSV
3184 if os.path.exists(filepath):
3185 logger.info("user_id:%s downloading from filepath %s", user_id, filepath)
3186 return FileResponse(filepath)
3187
3188 # File not found error
3189 raise HTTPException(status_code=400, detail="File not found")
3190
3191
3192# Write and download sankey diagram
3193@router.get("/process-sankey")
3194@limiter.limit("2/minute")
3195async def process_sankey(request: Request, db_session: database.DBSession, user_id: str = Depends(validate_session)):
3196 # Validate user session, redirect if invalid
3197 if not user_id:
3198 return RedirectResponse("/logout", status_code=303)
3199
3200 num_applications = 0
3201 num_offers = 0
3202 num_rejected = 0
3203 num_request_for_availability = 0
3204 num_interview_scheduled = 0
3205 num_no_response = 0
3206
3207 # Get job related email data from DB
3208 emails = query_emails(request, db_session=db_session, user_id=user_id)
3209 if not emails:
3210 raise HTTPException(status_code=400, detail="No data found to write")
3211
3212 for email in emails:
3213 # normalize the output
3214 status = email.application_status.strip().lower()
3215 num_applications += 1
3216 if status == "offer":
3217 num_offers += 1
3218 elif status == "rejected":
3219 num_rejected += 1
3220 elif status == "request for availability":
3221 num_request_for_availability += 1
3222 elif status == "interview scheduled":
3223 num_interview_scheduled += 1
3224 elif status == "no response":
3225 num_no_response += 1
3226
3227 # Create the Sankey diagram
3228 fig = go.Figure(go.Sankey(
3229 node=dict(label=[f"Applications ({num_applications})",
3230 f"Offers ({num_offers})",
3231 f"Rejected ({num_rejected})",
3232 f"Request for Availability ({num_request_for_availability})",
3233 f"Interview Scheduled ({num_interview_scheduled})",
3234 f"No Response ({num_no_response})"]),
3235 link=dict(source=[0, 0, 0, 0, 0], target=[1, 2, 3, 4, 5],
3236 value=[num_offers, num_rejected, num_request_for_availability, num_interview_scheduled, num_no_response])))
3237
3238
3239 # Define the user's file path and ensure the directory exists
3240 directory = get_user_filepath(user_id)
3241 filename = "sankey_diagram.png"
3242 filepath = os.path.join(directory, filename)
3243
3244 # Ensure the directory exists
3245 os.makedirs(directory, exist_ok=True)
3246
3247 try:
3248 # Save the Sankey diagram as PNG
3249 fig.write_image(filepath) # Requires Kaleido for image export
3250 logger.info("user_id:%s Sankey diagram saved to %s", user_id, filepath)
3251
3252 # Return the file with correct headers and explicit filename
3253 return FileResponse(
3254 filepath,
3255 media_type="image/png", # Correct media type for PNG
3256 filename=filename,
3257 headers={"Content-Disposition": f"attachment; filename={filename}"} # Ensure correct filename in header
3258 )
3259 except Exception as e:
3260 logger.error("Error generating Sankey diagram for user_id:%s - %s", user_id, str(e))
3261 raise HTTPException(status_code=500, detail="Error generating Sankey diagram")
3262
3263
3264
3265---
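Why `for key, value in email` works in the CSV writers above: SQLModel instances are pydantic models, and iterating one yields `(field_name, value)` pairs. A simplified, self-contained stand-in for UserEmails:

```python
from sqlmodel import SQLModel

class Row(SQLModel):
    company_name: str = "Acme"
    subject: str = "Offer"

selected = {"company_name"}
print({key: value for key, value in Row() if key in selected})
# -> {'company_name': 'Acme'}
```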
3266target_repo/backend/routes/start_date_routes.py
3267---
3268import logging
3269from fastapi import APIRouter, Request, Form, Depends
3270from fastapi.responses import JSONResponse, HTMLResponse
3271from db.utils.user_utils import add_user
3272import json
3273from utils.auth_utils import AuthenticatedUser
3274from google.oauth2.credentials import Credentials
3275from session.session_layer import validate_session
3276from slowapi import Limiter
3277from slowapi.util import get_remote_address
3278
3279limiter = Limiter(key_func=get_remote_address)
3280
3281# Logger setup
3282logger = logging.getLogger(__name__)
3283
3284api_call_finished = False
3285
3286# FastAPI router for email routes
3287router = APIRouter()
3288
3289@router.post("/set-start-date")
3290@limiter.limit("1/minute")
3291async def set_start_date(request: Request, start_date: str = Form(...), user_id: str = Depends(validate_session)):
3292 """Updates the user's job search start date in the database."""
3293 user_id = request.session.get("user_id")
3294
3295 if not user_id:
3296 return HTMLResponse(content="Invalid request. Please log in again.", status_code=400)
3297
3298 # Retrieve stored credentials
3299 creds_json = request.session.get("creds")
3300 if not creds_json:
3301 logger.error(f"user_id:{user_id} missing credentials /set-start-date")
3302 return HTMLResponse(content="User not authenticated. Please log in again.", status_code=401)
3303
3304 try:
3305 # Convert JSON string back to Credentials object
3306 creds_dict = json.loads(creds_json)
3307 creds = Credentials.from_authorized_user_info(creds_dict) # Convert dict to Credentials
3308  user = AuthenticatedUser(creds, start_date)
3309
3310 # Save start date in DB
3311 add_user(user, request, start_date)
3312
3313 # Update session to remove "new user" status
3314 request.session["is_new_user"] = False
3315
3316 logger.info(f"user_id:{user_id} added start date {start_date}")
3317
3318 return JSONResponse(content={"message": "Start date updated successfully"}, status_code=200)
3319 except Exception as e:
3320 logger.error(f"Error reconstructing credentials: {e}")
3321 return HTMLResponse(content="Failed to save start date. Try again.", status_code=500)
3322
3323def get_start_date(request: Request, user_id: str = Depends(validate_session)) -> str:
3324 """Fetches the user's job search start date from the database."""
3325 # Query the database for the user's start date
3326 logger.info(f"Getting start date for user_id: {user_id}")
3327 return request.session.get("start_date")
3328
3329
3330@router.get("/api/session-data")
3331@limiter.limit("5/minute")
3332async def get_session_data(request: Request, user_id: str = Depends(validate_session)):
3333 """Fetches session data for the user."""
3334
3335 user_id = request.session.get("user_id")
3336 token_expiry = request.session.get("token_expiry")
3337 session_id = request.session.get("session_id")
3338 is_new_user = request.session.get("is_new_user", False)
3339
3340 logger.info(f"Fetching session data: user_id={user_id}, session_id={session_id}")
3341
3342 if not user_id:
3343 logger.warning("Session data missing user_id. Possible expired or invalid session.")
3344 return JSONResponse(content={"error": "Session expired or invalid"}, status_code=401)
3345
3346 session_data = {
3347 "user_id": user_id,
3348 "token_expiry": token_expiry,
3349 "session_id": session_id,
3350 "is_new_user": is_new_user,
3351 }
3352
3353 logger.info(f"Session data being returned: {session_data}")
3354
3355 return JSONResponse(content=session_data)
3356
3357---
3358target_repo/backend/routes/users_routes.py
3359---
3360import logging
3361from fastapi import APIRouter, Depends, Request, HTTPException
3362from sqlmodel import select
3363from db.user_emails import UserEmails
3364from utils.config_utils import get_settings
3365from session.session_layer import validate_session
3366from routes.email_routes import query_emails
3367import database
3368from slowapi import Limiter
3369from slowapi.util import get_remote_address
3370
3371
3372# Logger setup
3373logger = logging.getLogger(__name__)
3374
3375# Get settings
3376settings = get_settings()
3377APP_URL = settings.APP_URL
3378
3379api_call_finished = False
3380
3381# FastAPI router for email routes
3382router = APIRouter()
3383limiter = Limiter(key_func=get_remote_address)
3384
3385@router.get("/get-response-rate")
3386@limiter.limit("2/minute")
3387def response_rate_by_job_title(request: Request, db_session: database.DBSession, user_id: str = Depends(validate_session)):
3388
3389 try:
3390 # Get job related email data from DB
3391 user_emails = query_emails(request, db_session=db_session, user_id=user_id)
3392
3393 index = 0
3394
3395 # Tracks all job titles and their index in response_rate
3396 job_titles = {}
3397
3398 # Store (company, job_title) tuples to avoid duplicates
3399 companies = []
3400
3401 # List of dictionaries to store job titles and their response rates
3402 response_rate_data = []
3403
3404 for email in user_emails:
3405 if email.job_title not in job_titles:
3406   status = email.application_status.strip().lower()
3407   if status in ("request for availability", "offer", "interview scheduled"):
3408    response_rate_data.append({"title": email.job_title, "responses": 1, "total": 1})
3409   else:
3410    response_rate_data.append({"title": email.job_title, "responses": 0, "total": 1})
3411   companies.append((email.company_name, email.job_title))
3412   job_titles[email.job_title] = index
3413   index += 1
3414  elif (email.company_name, email.job_title) not in companies:
3415   status = email.application_status.strip().lower()
3416   if status in ("request for availability", "offer", "interview scheduled"):
3417    response_rate_data[job_titles[email.job_title]]["responses"] += 1
3418   # every new (company, title) pair counts toward the total; responses only on a positive status
   response_rate_data[job_titles[email.job_title]]["total"] += 1
3419   companies.append((email.company_name, email.job_title))
3420
3421 response_rate = []
3422 for data in response_rate_data:
3423 response_rate.append({
3424 "title": data["title"],
3425 "rate": round(data["responses"] / data["total"] * 100, 2)
3426 })
3427
3428 return response_rate
3429
3430 except Exception as e:
3431 logger.error(f"Error fetching job titles for user_id {user_id}: {e}")
3432 raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
3433
3434@router.get("/user-response-rate")
3435def calculate_response_rate(
3436 request: Request, db_session: database.DBSession, user_id: str = Depends(validate_session)
3437) -> dict:
3438 user_emails = db_session.exec(
3439 select(UserEmails).where(UserEmails.user_id == user_id)
3440 ).all()
3441
3442 # if the user has no applications, just return a 0.0 rate
3443 total_apps = len(user_emails)
3444 if total_apps == 0:
3445  return {"value": 0.0}
3446
3447 interview_requests = 0
3448 for email in user_emails:
3449  # using "request for availability" as an interview request since it should come before the offer and scheduled interview
3450 if (
3451 email.application_status
3452 and email.application_status.lower() == "request for availability"
3453 ):
3454 interview_requests += 1
3455
3456 response_rate_percent = (interview_requests / total_apps) * 100
3457 return {"value": round(response_rate_percent, 1)}
3458
3459
3460
3461---
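A tiny worked example of the per-title response rate computed above, with illustrative numbers: three distinct (company, title) pairs for one title, one of which drew an interview request, gives 1 response out of 3 applications:

```python
responses, total = 1, 3
print(round(responses / total * 100, 2))  # -> 33.33
```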
3462target_repo/backend/session/session_layer.py
3463---
3464# backend/session/session_layer.py
3465import logging
3466import secrets
3467from datetime import datetime
3468from fastapi import Request
3469from utils.config_utils import get_settings
3470
3471settings = get_settings()
3472
3473def create_random_session_string() -> str:
3474 return secrets.token_urlsafe(32) # Generates a random URL-safe string
3475
3476
3477def validate_session(request: Request) -> str:
3478 """Retrieves Authorization, session_id, access_token and token_expiry
3479 from request cookies and validates them.
3480 Session ID should match the stored session.
3481 Access token should not be expired.
3482 """
3483 if settings.is_publicly_deployed:
3484 session_authorization = request.cookies.get("__Secure-Authorization")
3485 else:
3486 session_authorization = request.cookies.get("Authorization")
3487
3488 session_id = request.session.get("session_id")
3489 session_access_token = request.session.get("access_token")
3490 token_exp = request.session.get("token_expiry")
3491 user_id = request.session.get("user_id")
3492
3493 if not session_authorization and not session_access_token:
3494 logging.info(
3495 "No Authorization and access_token in session, redirecting to login"
3496 )
3497 return ""
3498
3499 if session_authorization != session_id:
3500 logging.info("Authorization does not match Session Id, redirecting to login")
3501 return ""
3502
3503 if is_token_expired(token_exp):
3504 logging.info("Access_token is expired, redirecting to login")
3505 return ""
3506
3507 logging.info("Valid Session, Access granted.")
3508 return user_id
3509
3510
3511def is_token_expired(iso_expiry: str) -> bool:
3512 """
3513 Converts ISO format timestamp (which serves as the expiry time of the token) to datetime.
3514 If the current time is greater than the expiry time,
3515 the token is expired.
3516 """
3517 if iso_expiry:
3518 datetime_expiry = datetime.fromisoformat(iso_expiry) # UTC time
3519 difference_in_minutes = (
3520 datetime_expiry - datetime.utcnow()
3521 ).total_seconds() / 60
3522 return difference_in_minutes <= 0
3523
3524 return True
3525
3526
3527---
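A short illustration of `is_token_expired`, which compares a naive ISO timestamp against `utcnow()` and treats a missing expiry as expired; the values are illustrative and assume the module is importable:

```python
from datetime import datetime, timedelta

future = (datetime.utcnow() + timedelta(hours=1)).isoformat()
print(is_token_expired(future))  # -> False
print(is_token_expired(""))      # -> True (no expiry means expired)
```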