From e1f41307f294b097250fa166be0c306e2b220f36 Mon Sep 17 00:00:00 2001
From: Matt Wiseley
Date: Sat, 18 Oct 2025 13:14:35 -0400
Subject: [PATCH] Add API debug output and optimize comment fetching for completed tasks

---
 README.md         |   5 +-
 export_todoist.py | 229 +++++++++++++++++++++++++++++++++++-----------
 2 files changed, 179 insertions(+), 55 deletions(-)

diff --git a/README.md b/README.md
index 70a4af2..d075a17 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,7 @@ Todoist is a SaaS task manager. Todoist provides backups of current tasks, but d
 - Downloads attachments to `output/attachments/` and references them in the JSON and HTML output
 - JSON and HTML files are named with the current date when the script is run
 - Maintains `Todoist-Completed-History.json` so completed tasks older than Todoist's 90-day API window stay in future exports
+- Reuses archived comments for completed tasks to avoid unnecessary API calls (assumes no new comments after completion)
 
 ## Setup
 - Ensure you have Python 3.8 or newer installed. Check with `python --version` on the command line.
@@ -26,8 +27,8 @@ Todoist is a SaaS task manager. Todoist provides backups of current tasks, but d
    ```bash
    python export_todoist.py export
    ```
-   This will create `output/Todoist-Actual-Backup-YYYY-MM-DD.json` and `output/Todoist-Actual-Backup-YYYY-MM-DD.html`, and it will update `output/attachments/` with any downloaded files while leaving `Todoist-Completed-History.json` in the project root.
-   Keep `Todoist-Completed-History.json` somewhere safe (e.g., in source control or a backup location); it is the only way the exporter can retain completions older than Todoist's 90-day API retention window.
+   This will create `output/Todoist-Actual-Backup-YYYY-MM-DD.json` and `output/Todoist-Actual-Backup-YYYY-MM-DD.html`, and it will update `output/attachments/` with any downloaded files while leaving `Todoist-Completed-History.json` in the project root.
+   Keep `Todoist-Completed-History.json` somewhere safe (e.g., in source control or a backup location); it is the only way the exporter can retain completions older than Todoist's 90-day API retention window.
 4. To see usage instructions, run the script with no arguments or any argument other than `export`.
 
 ## Requirements

diff --git a/export_todoist.py b/export_todoist.py
index 96555bb..95362cd 100644
--- a/export_todoist.py
+++ b/export_todoist.py
@@ -4,7 +4,9 @@ import json
 import time
 import getpass
 import shutil
+import copy
 from collections import defaultdict
+from urllib.parse import quote_plus
 import requests
 from datetime import datetime, timedelta
 from todoist_api_python.api import TodoistAPI
@@ -15,6 +17,12 @@ ATTACHMENTS_DIR = os.path.join(OUTPUT_DIR, "attachments")
 LEGACY_ATTACHMENTS_DIR = "attachments"
 TODOIST_API_TOKEN: str | None = None
 COMPLETED_HISTORY_FILE = "Todoist-Completed-History.json"
+COMMENT_REQUEST_MIN_INTERVAL = 0.5  # seconds
+COMMENT_MAX_ATTEMPTS = 8
+PROJECTS_URL = "https://api.todoist.com/rest/v2/projects"
+TASKS_URL = "https://api.todoist.com/rest/v2/tasks"
+COMPLETED_TASKS_URL = "https://api.todoist.com/api/v1/tasks/completed/by_completion_date"
+COMMENTS_URL = "https://api.todoist.com/api/v1/comments"
 
 
 def json_serial(obj):
@@ -119,28 +127,63 @@ def normalize_timestamp(value):
     return str(value)
 
 
+def make_completed_task_key_from_dict(task):
+    task_id = str(task.get('id', '')) if isinstance(task, dict) else ""
+    if not task_id:
+        return None
+    completed_at = normalize_timestamp(task.get('completed_at'))
+    if not completed_at:
+        completed_at = normalize_timestamp(task.get('updated_at'))
+    return (task_id, completed_at)
+
+
+def make_completed_task_key_from_api(task):
+    task_id = getattr(task, "id", None)
+    if not task_id:
+        return None
+    completed_at = normalize_timestamp(getattr(task, "completed_at", None))
+    if not completed_at:
+        completed_at = normalize_timestamp(getattr(task, "updated_at", None))
+    return (str(task_id), completed_at)
+
+
 def merge_completed_lists(history_tasks, new_tasks):
     merged = []
-    seen = set()
+    index_by_key = {}
 
-    def make_key(task):
-        task_id = str(task.get('id', ''))
-        completed_at = normalize_timestamp(task.get('completed_at'))
-        if not completed_at:
-            completed_at = normalize_timestamp(task.get('updated_at'))
-        return (task_id, completed_at)
+    def merge_task_dicts(primary, secondary, prefer_primary=True):
+        for key, value in secondary.items():
+            if key == 'comments':
+                if (not primary.get('comments')) and value:
+                    primary['comments'] = value
+                continue
+            if key == 'attachments':
+                if (not primary.get('attachments')) and value:
+                    primary['attachments'] = value
+                continue
+            if key not in primary or primary[key] in (None, "", [], {}):
+                primary[key] = value
+                continue
+            if not prefer_primary:
+                primary[key] = value
+        return primary
 
-    def add_task(task):
-        key = make_key(task)
-        if key in seen:
+    def add_or_merge(task, prefer_existing=True):
+        key = make_completed_task_key_from_dict(task)
+        if key is None:
+            merged.append(task)
             return
-        seen.add(key)
-        merged.append(task)
+        if key in index_by_key:
+            idx = index_by_key[key]
+            merge_task_dicts(merged[idx], task, prefer_primary=prefer_existing)
+        else:
+            merged.append(task)
+            index_by_key[key] = len(merged) - 1
 
     for item in new_tasks:
-        add_task(item)
+        add_or_merge(item, prefer_existing=True)
     for item in history_tasks:
-        add_task(item)
+        add_or_merge(item, prefer_existing=True)
 
     def sort_key(task):
         completed_at = normalize_timestamp(task.get('completed_at'))
@@ -232,11 +275,12 @@ def _get_retry_delay(response, attempt, base_delay=5, max_delay=120):
     return min(max_delay, base_delay * (2 ** attempt))
 
 
-def execute_with_rate_limit(func, *args, **kwargs):
+def execute_with_rate_limit(func, *args, max_attempts=5, request_desc=None, **kwargs):
     attempts = 0
-    max_attempts = 5
+    desc = request_desc or getattr(func, "__name__", "call")
     while True:
         try:
+            print(f" Calling {desc}")
             return func(*args, **kwargs)
         except Exception as error:  # pylint: disable=broad-except
             status_code = getattr(error, "status_code", None)
@@ -246,7 +290,9 @@ def execute_with_rate_limit(func, *args, **kwargs):
             if status_code == 429 and attempts < max_attempts:
                 delay = _get_retry_delay(response, attempts)
                 attempts += 1
-                print(f"Rate limit hit for {func.__name__}. Waiting {delay} seconds before retry {attempts}/{max_attempts}...")
+                print(f" Rate limit hit for {desc}. Waiting {delay} seconds before retry {attempts}/{max_attempts}...")
+                if delay > 1:
+                    print(f" Waiting {delay} seconds due to rate limiting")
                 time.sleep(delay)
                 continue
             raise
@@ -255,7 +301,10 @@
 def fetch_all_projects(api):
     projects_by_id = {}
     try:
-        projects_iter = execute_with_rate_limit(api.get_projects)
+        projects_iter = execute_with_rate_limit(
+            api.get_projects,
+            request_desc=f"GET {PROJECTS_URL}"
+        )
         for batch in projects_iter:
             for project in batch:
                 projects_by_id[str(getattr(project, "id", ""))] = project
@@ -267,7 +316,10 @@
 def fetch_active_tasks_by_project(api):
     tasks_by_project = defaultdict(list)
     try:
-        tasks_iter = execute_with_rate_limit(api.get_tasks)
+        tasks_iter = execute_with_rate_limit(
+            api.get_tasks,
+            request_desc=f"GET {TASKS_URL}"
+        )
         for batch in tasks_iter:
             for task in batch:
                 tasks_by_project[str(getattr(task, "project_id", ""))].append(task)
@@ -280,8 +332,10 @@
 def fetch_completed_tasks_by_project(api, since, until):
     tasks_by_project = defaultdict(list)
     try:
+        query = f"?since={since.isoformat()}&until={until.isoformat()}"
         completed_iter = execute_with_rate_limit(
             api.get_completed_tasks_by_completion_date,
+            request_desc=f"GET {COMPLETED_TASKS_URL}{query}",
             since=since,
             until=until,
         )
@@ -297,29 +351,78 @@ def fetch_comments_by_task(api, project_ids, task_ids):
     comments_by_task = defaultdict(list)
     total_comments = 0
+    last_comment_call = 0.0
+
+    def throttled_get_comments(**kwargs):
+        nonlocal last_comment_call
+        elapsed = time.time() - last_comment_call
+        if elapsed < COMMENT_REQUEST_MIN_INTERVAL:
+            time.sleep(COMMENT_REQUEST_MIN_INTERVAL - elapsed)
+        params = []
+        for key, value in kwargs.items():
+            if value is None:
+                continue
+            params.append(f"{key}={quote_plus(str(value))}")
+        query = "&".join(params)
+        desc = f"GET {COMMENTS_URL}{('?' + query) if query else ''}"
+        result = execute_with_rate_limit(
+            api.get_comments,
+            max_attempts=COMMENT_MAX_ATTEMPTS,
+            request_desc=desc,
+            **kwargs,
+        )
+        last_comment_call = time.time()
+        return result
+
+    def handle_comment_error(scope, identifier, error):
+        status_code = getattr(error, "status_code", None)
+        response = getattr(error, "response", None)
+        if status_code is None and response is not None:
+            status_code = getattr(response, "status_code", None)
+        if status_code == 404:
+            print(f" Comments not found for {scope} {identifier} (404). Skipping.")
+            return False
+        if status_code == 429:
+            delay = _get_retry_delay(response, COMMENT_MAX_ATTEMPTS)
+            print(
+                f" Rate limit while fetching comments for {scope} {identifier} after retries; waiting {delay} seconds before continuing."
+            )
+            if delay > 1:
+                print(f" Waiting {delay} seconds due to rate limiting")
+            time.sleep(delay)
+            return True
+        print(f" Error fetching comments for {scope} {identifier}: {error}")
+        return False
+
     for project_id in project_ids:
-        try:
-            comments_iter = execute_with_rate_limit(api.get_comments, project_id=project_id)
-            for batch in comments_iter:
-                for comment in batch:
-                    task_id = str(getattr(comment, "task_id", ""))
-                    if task_id:
-                        comments_by_task[task_id].append(comment)
-                        total_comments += 1
-        except Exception as error:  # pylint: disable=broad-except
-            print(f"Error fetching comments for project {project_id}: {error}")
+        while True:
+            try:
+                comments_iter = throttled_get_comments(project_id=project_id)
+                for batch in comments_iter:
+                    for comment in batch:
+                        task_id = str(getattr(comment, "task_id", ""))
+                        if task_id:
+                            comments_by_task[task_id].append(comment)
+                            total_comments += 1
+                break
+            except Exception as error:  # pylint: disable=broad-except
+                if not handle_comment_error("project", project_id, error):
+                    break
     missing_task_ids = [task_id for task_id in task_ids if task_id not in comments_by_task]
     for task_id in missing_task_ids:
-        try:
-            comments_iter = execute_with_rate_limit(api.get_comments, task_id=task_id)
-            for batch in comments_iter:
-                for comment in batch:
-                    key = str(getattr(comment, "task_id", ""))
-                    if key:
-                        comments_by_task[key].append(comment)
-                        total_comments += 1
-        except Exception as error:  # pylint: disable=broad-except
-            print(f"Error fetching comments for task {task_id}: {error}")
+        while True:
+            try:
+                comments_iter = throttled_get_comments(task_id=task_id)
+                for batch in comments_iter:
+                    for comment in batch:
+                        key = str(getattr(comment, "task_id", ""))
+                        if key:
+                            comments_by_task[key].append(comment)
+                            total_comments += 1
+                break
+            except Exception as error:  # pylint: disable=broad-except
+                if not handle_comment_error("task", task_id, error):
+                    break
 
     print(
         f"Fetched {total_comments} comments mapped to {len(comments_by_task)} tasks"
     )
@@ -429,26 +532,37 @@ def main():
     until = datetime.now()
     active_tasks_by_project = fetch_active_tasks_by_project(api)
     completed_tasks_by_project = fetch_completed_tasks_by_project(api, since=since, until=until)
-    comment_project_ids = sorted(
+    completed_history = load_completed_history()
+    history_by_key = {}
+    for task_list in completed_history.values():
+        for stored_task in task_list:
+            key = make_completed_task_key_from_dict(stored_task)
+            if key:
+                history_by_key[key] = stored_task
+
+    active_comment_project_ids = sorted(
         pid
-        for pid in (set(active_tasks_by_project.keys()) | set(completed_tasks_by_project.keys()))
-        if pid
+        for pid, tasks in active_tasks_by_project.items()
+        if pid and tasks
     )
-    task_ids_for_comments: set[str] = set()
-    for task_list in active_tasks_by_project.values():
-        for task in task_list:
-            task_id = getattr(task, "id", None)
-            if task_id:
-                task_ids_for_comments.add(str(task_id))
+    completed_task_ids_for_comments: set[str] = set()
+    skipped_completed_history = {}
     for task_list in completed_tasks_by_project.values():
         for task in task_list:
-            task_id = getattr(task, "id", None)
-            if task_id:
-                task_ids_for_comments.add(str(task_id))
+            key = make_completed_task_key_from_api(task)
+            if key is None:
+                continue
+            history_entry = history_by_key.get(key)
+            if history_entry:
+                skipped_completed_history[key] = history_entry
+            else:
+                completed_task_ids_for_comments.add(key[0])
+
     comments_by_task = fetch_comments_by_task(
-        api, comment_project_ids, sorted(task_ids_for_comments)
+        api,
+        active_comment_project_ids,
+        sorted(completed_task_ids_for_comments),
     )
-    completed_history = load_completed_history()
     updated_history = {}
     data = []
     for project in projects:
@@ -460,6 +574,15 @@
 
         processed_active = [process_task(t, comments_by_task) for t in active_tasks]
         processed_completed = [process_task(t, comments_by_task) for t in completed_tasks]
 
+        for task in processed_completed:
+            key = make_completed_task_key_from_dict(task)
+            history_entry = skipped_completed_history.get(key) if key else None
+            if history_entry:
+                if (not task.get('comments')) and history_entry.get('comments'):
+                    task['comments'] = copy.deepcopy(history_entry['comments'])
+                if (not task.get('attachments')) and history_entry.get('attachments'):
+                    task['attachments'] = copy.deepcopy(history_entry['attachments'])
+
         # Build hierarchy for active tasks
         project_dict['tasks'] = build_task_hierarchy(processed_active)