Add API debug output and optimize comment fetching for completed tasks

2025-10-18 13:14:35 -04:00
parent c0cbbb00ba
commit e1f41307f2
2 changed files with 179 additions and 55 deletions

View File: README

@ -9,6 +9,7 @@ Todoist is a SaaS task manager. Todoist provides backups of current tasks, but d
- Downloads attachments to `output/attachments/` and references them in the JSON and HTML output
- JSON and HTML files are named with the current date when the script is run
- Maintains `Todoist-Completed-History.json` so completed tasks older than Todoist's 90-day API window stay in future exports
- Reuses archived comments for completed tasks to avoid unnecessary API calls (assumes no new comments after completion)
## Setup
- Ensure you have Python 3.8 or newer installed. Check with `python --version` on the command line.
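The "reuses archived comments" bullet above corresponds to the key-based history lookup added in the script below. A minimal sketch of the idea, assuming a history entry shaped like the task dicts the merge code reads; only the `id`, `completed_at`, `comments`, and `attachments` fields are confirmed by this diff, everything else is illustrative:

```python
# Hypothetical shapes for illustration; the real file is written by the exporter.
history_entry = {
    "id": "1234567890",
    "completed_at": "2025-07-01T12:00:00+00:00",
    "comments": [{"content": "Done, see attachment"}],
    "attachments": [{"file_name": "report.pdf"}],
}
history_by_key = {("1234567890", "2025-07-01T12:00:00+00:00"): history_entry}

def needs_comment_fetch(task: dict) -> bool:
    """Skip the comments API when an archived copy already holds them."""
    key = (str(task["id"]), task["completed_at"])
    return key not in history_by_key

# An already-archived completion triggers no comment request:
print(needs_comment_fetch({"id": "1234567890",
                           "completed_at": "2025-07-01T12:00:00+00:00"}))  # False
```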
@ -26,8 +27,8 @@ Todoist is a SaaS task manager. Todoist provides backups of current tasks, but d
```bash
python export_todoist.py export
```
This will create `output/Todoist-Actual-Backup-YYYY-MM-DD.json` and `output/Todoist-Actual-Backup-YYYY-MM-DD.html`, and it will update `output/attachments/` with any downloaded files while leaving `Todoist-Completed-History.json` in the project root.
Keep `Todoist-Completed-History.json` somewhere safe (e.g., in source control or a backup location); it is the only way the exporter can retain completions older than Todoist's 90-day API retention window.
4. To see usage instructions, run the script with no arguments or any argument other than `export`.
## Requirements

View File: export_todoist.py

@ -4,7 +4,9 @@ import json
import time
import getpass
import shutil
import copy
from collections import defaultdict
from urllib.parse import quote_plus
import requests
from datetime import datetime, timedelta
from todoist_api_python.api import TodoistAPI
@ -15,6 +17,12 @@ ATTACHMENTS_DIR = os.path.join(OUTPUT_DIR, "attachments")
LEGACY_ATTACHMENTS_DIR = "attachments"
TODOIST_API_TOKEN: str | None = None
COMPLETED_HISTORY_FILE = "Todoist-Completed-History.json"
COMMENT_REQUEST_MIN_INTERVAL = 0.5 # seconds
COMMENT_MAX_ATTEMPTS = 8
PROJECTS_URL = "https://api.todoist.com/rest/v2/projects"
TASKS_URL = "https://api.todoist.com/rest/v2/tasks"
COMPLETED_TASKS_URL = "https://api.todoist.com/api/v1/tasks/completed/by_completion_date"
COMMENTS_URL = "https://api.todoist.com/api/v1/comments"
def json_serial(obj):
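The two new comment constants drive a simple client-side throttle: at most one comments request per half second, with up to eight retry attempts on 429s. A minimal sketch of the pacing logic (the real version lives in `fetch_comments_by_task` below and also routes through `execute_with_rate_limit`):

```python
import time

COMMENT_REQUEST_MIN_INTERVAL = 0.5  # seconds between comment requests

_last_call = 0.0

def throttle() -> None:
    """Sleep just long enough to keep at least 0.5 s between consecutive calls."""
    global _last_call
    elapsed = time.time() - _last_call
    if elapsed < COMMENT_REQUEST_MIN_INTERVAL:
        time.sleep(COMMENT_REQUEST_MIN_INTERVAL - elapsed)
    _last_call = time.time()
```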
@ -119,28 +127,63 @@ def normalize_timestamp(value):
return str(value)
def make_completed_task_key_from_dict(task):
task_id = str(task.get('id', '')) if isinstance(task, dict) else ""
if not task_id:
return None
completed_at = normalize_timestamp(task.get('completed_at'))
if not completed_at:
completed_at = normalize_timestamp(task.get('updated_at'))
return (task_id, completed_at)
def make_completed_task_key_from_api(task):
task_id = getattr(task, "id", None)
if not task_id:
return None
completed_at = normalize_timestamp(getattr(task, "completed_at", None))
if not completed_at:
completed_at = normalize_timestamp(getattr(task, "updated_at", None))
return (str(task_id), completed_at)
def merge_completed_lists(history_tasks, new_tasks):
merged = []
seen = set()
index_by_key = {}
def make_key(task):
task_id = str(task.get('id', ''))
completed_at = normalize_timestamp(task.get('completed_at'))
if not completed_at:
completed_at = normalize_timestamp(task.get('updated_at'))
return (task_id, completed_at)
def merge_task_dicts(primary, secondary, prefer_primary=True):
for key, value in secondary.items():
if key == 'comments':
if (not primary.get('comments')) and value:
primary['comments'] = value
continue
if key == 'attachments':
if (not primary.get('attachments')) and value:
primary['attachments'] = value
continue
if key not in primary or primary[key] in (None, "", [], {}):
primary[key] = value
continue
if not prefer_primary:
primary[key] = value
return primary
def add_task(task):
key = make_key(task)
if key in seen:
def add_or_merge(task, prefer_existing=True):
key = make_completed_task_key_from_dict(task)
if key is None:
merged.append(task)
return
seen.add(key)
merged.append(task)
if key in index_by_key:
idx = index_by_key[key]
merge_task_dicts(merged[idx], task, prefer_primary=prefer_existing)
else:
merged.append(task)
index_by_key[key] = len(merged) - 1
for item in new_tasks:
add_task(item)
add_or_merge(item, prefer_existing=True)
for item in history_tasks:
add_task(item)
add_or_merge(item, prefer_existing=True)
def sort_key(task):
completed_at = normalize_timestamp(task.get('completed_at'))
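A quick worked example of the new merge semantics, assuming `merge_task_dicts` from the hunk above is in scope: a fresh API record with no comments and an archived record with comments collapse into one entry that keeps the archived comments, while fresh values otherwise win.

```python
fresh = {"id": "42", "completed_at": "2025-10-01T09:00:00+00:00", "comments": []}
archived = {"id": "42", "completed_at": "2025-10-01T09:00:00+00:00",
            "comments": [{"content": "carried over"}],
            "content": "old title"}

merged = merge_task_dicts(dict(fresh), archived, prefer_primary=True)
assert merged["comments"] == [{"content": "carried over"}]  # empty list backfilled
assert merged["content"] == "old title"                     # missing key backfilled
assert merged["completed_at"] == fresh["completed_at"]      # primary value kept
```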
@ -232,11 +275,12 @@ def _get_retry_delay(response, attempt, base_delay=5, max_delay=120):
return min(max_delay, base_delay * (2 ** attempt))
def execute_with_rate_limit(func, *args, **kwargs):
def execute_with_rate_limit(func, *args, max_attempts=5, request_desc=None, **kwargs):
attempts = 0
max_attempts = 5
desc = request_desc or getattr(func, "__name__", "call")
while True:
try:
print(f" Calling {desc}")
return func(*args, **kwargs)
except Exception as error: # pylint: disable=broad-except
status_code = getattr(error, "status_code", None)
@ -246,7 +290,9 @@ def execute_with_rate_limit(func, *args, **kwargs):
if status_code == 429 and attempts < max_attempts:
delay = _get_retry_delay(response, attempts)
attempts += 1
print(f"Rate limit hit for {func.__name__}. Waiting {delay} seconds before retry {attempts}/{max_attempts}...")
print(f" Rate limit hit for {desc}. Waiting {delay} seconds before retry {attempts}/{max_attempts}...")
if delay > 1:
print(f" Waiting {delay} seconds due to rate limiting")
time.sleep(delay)
continue
raise
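With the defaults shown in the `_get_retry_delay` hunk header (`base_delay=5`, `max_delay=120`), the fallback backoff used when the 429 response carries no usable retry hint doubles per attempt and caps at two minutes:

```python
def fallback_delay(attempt: int, base_delay: int = 5, max_delay: int = 120) -> int:
    # Mirrors the min(max_delay, base_delay * (2 ** attempt)) line above.
    return min(max_delay, base_delay * (2 ** attempt))

print([fallback_delay(a) for a in range(6)])  # [5, 10, 20, 40, 80, 120]
```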
@ -255,7 +301,10 @@ def execute_with_rate_limit(func, *args, **kwargs):
def fetch_all_projects(api):
projects_by_id = {}
try:
projects_iter = execute_with_rate_limit(api.get_projects)
projects_iter = execute_with_rate_limit(
api.get_projects,
request_desc=f"GET {PROJECTS_URL}"
)
for batch in projects_iter:
for project in batch:
projects_by_id[str(getattr(project, "id", ""))] = project
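The same wrapping pattern repeats for active and completed tasks below: each SDK call gets a `request_desc` naming the REST endpoint it corresponds to, purely for the new debug output. A hypothetical helper that captures the pattern (the diff inlines it at each call site instead):

```python
def call_with_desc(api_method, url, **kwargs):
    # Hypothetical convenience wrapper, not part of the commit.
    return execute_with_rate_limit(api_method, request_desc=f"GET {url}", **kwargs)

# e.g. call_with_desc(api.get_projects, PROJECTS_URL)
```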
@ -267,7 +316,10 @@ def fetch_all_projects(api):
def fetch_active_tasks_by_project(api):
tasks_by_project = defaultdict(list)
try:
tasks_iter = execute_with_rate_limit(api.get_tasks)
tasks_iter = execute_with_rate_limit(
api.get_tasks,
request_desc=f"GET {TASKS_URL}"
)
for batch in tasks_iter:
for task in batch:
tasks_by_project[str(getattr(task, "project_id", ""))].append(task)
@ -280,8 +332,10 @@ def fetch_active_tasks_by_project(api):
def fetch_completed_tasks_by_project(api, since, until):
tasks_by_project = defaultdict(list)
try:
query = f"?since={since.isoformat()}&until={until.isoformat()}"
completed_iter = execute_with_rate_limit(
api.get_completed_tasks_by_completion_date,
request_desc=f"GET {COMPLETED_TASKS_URL}{query}",
since=since,
until=until,
)
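The query string assembled here feeds only the debug label; the window itself is passed to the SDK as `since`/`until`. A sketch of how the 90-day window mentioned in the README might be derived (the actual computation of `since` is outside this diff, so the `timedelta` below is an assumption):

```python
from datetime import datetime, timedelta

COMPLETED_TASKS_URL = "https://api.todoist.com/api/v1/tasks/completed/by_completion_date"

until = datetime.now()              # as set in main() below
since = until - timedelta(days=90)  # assumed: Todoist's completed-task retention window
query = f"?since={since.isoformat()}&until={until.isoformat()}"
print(f"GET {COMPLETED_TASKS_URL}{query}")
```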
@ -297,29 +351,78 @@ def fetch_completed_tasks_by_project(api, since, until):
def fetch_comments_by_task(api, project_ids, task_ids):
comments_by_task = defaultdict(list)
total_comments = 0
last_comment_call = 0.0
def throttled_get_comments(**kwargs):
nonlocal last_comment_call
elapsed = time.time() - last_comment_call
if elapsed < COMMENT_REQUEST_MIN_INTERVAL:
time.sleep(COMMENT_REQUEST_MIN_INTERVAL - elapsed)
params = []
for key, value in kwargs.items():
if value is None:
continue
params.append(f"{key}={quote_plus(str(value))}")
query = "&".join(params)
desc = f"GET {COMMENTS_URL}{('?' + query) if query else ''}"
result = execute_with_rate_limit(
api.get_comments,
max_attempts=COMMENT_MAX_ATTEMPTS,
request_desc=desc,
**kwargs,
)
last_comment_call = time.time()
return result
def handle_comment_error(scope, identifier, error):
status_code = getattr(error, "status_code", None)
response = getattr(error, "response", None)
if status_code is None and response is not None:
status_code = getattr(response, "status_code", None)
if status_code == 404:
print(f" Comments not found for {scope} {identifier} (404). Skipping.")
return False
if status_code == 429:
delay = _get_retry_delay(response, COMMENT_MAX_ATTEMPTS)
print(
f" Rate limit while fetching comments for {scope} {identifier} after retries; waiting {delay} seconds before continuing."
)
if delay > 1:
print(f" Waiting {delay} seconds due to rate limiting")
time.sleep(delay)
return True
print(f" Error fetching comments for {scope} {identifier}: {error}")
return False
for project_id in project_ids:
try:
comments_iter = execute_with_rate_limit(api.get_comments, project_id=project_id)
for batch in comments_iter:
for comment in batch:
task_id = str(getattr(comment, "task_id", ""))
if task_id:
comments_by_task[task_id].append(comment)
total_comments += 1
except Exception as error: # pylint: disable=broad-except
print(f"Error fetching comments for project {project_id}: {error}")
while True:
try:
comments_iter = throttled_get_comments(project_id=project_id)
for batch in comments_iter:
for comment in batch:
task_id = str(getattr(comment, "task_id", ""))
if task_id:
comments_by_task[task_id].append(comment)
total_comments += 1
break
except Exception as error: # pylint: disable=broad-except
if not handle_comment_error("project", project_id, error):
break
missing_task_ids = [task_id for task_id in task_ids if task_id not in comments_by_task]
for task_id in missing_task_ids:
try:
comments_iter = execute_with_rate_limit(api.get_comments, task_id=task_id)
for batch in comments_iter:
for comment in batch:
key = str(getattr(comment, "task_id", ""))
if key:
comments_by_task[key].append(comment)
total_comments += 1
except Exception as error: # pylint: disable=broad-except
print(f"Error fetching comments for task {task_id}: {error}")
while True:
try:
comments_iter = throttled_get_comments(task_id=task_id)
for batch in comments_iter:
for comment in batch:
key = str(getattr(comment, "task_id", ""))
if key:
comments_by_task[key].append(comment)
total_comments += 1
break
except Exception as error: # pylint: disable=broad-except
if not handle_comment_error("task", task_id, error):
break
print(
f"Fetched {total_comments} comments mapped to {len(comments_by_task)} tasks"
)
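Both retry loops above delegate failures to `handle_comment_error`, whose boolean return encodes the policy: `True` (only for a 429) means wait and retry the same project or task; `False` (a 404 or anything else) means log and move on. The control flow, restated compactly:

```python
def fetch_or_skip(fetch, scope, identifier, handle_error):
    """Retry the same call while the handler says so; otherwise give up quietly."""
    while True:
        try:
            return fetch()
        except Exception as error:  # matches the broad except in the diff
            if not handle_error(scope, identifier, error):
                return None
```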
@ -429,26 +532,37 @@ def main():
until = datetime.now()
active_tasks_by_project = fetch_active_tasks_by_project(api)
completed_tasks_by_project = fetch_completed_tasks_by_project(api, since=since, until=until)
comment_project_ids = sorted(
completed_history = load_completed_history()
history_by_key = {}
for task_list in completed_history.values():
for stored_task in task_list:
key = make_completed_task_key_from_dict(stored_task)
if key:
history_by_key[key] = stored_task
active_comment_project_ids = sorted(
pid
for pid in (set(active_tasks_by_project.keys()) | set(completed_tasks_by_project.keys()))
if pid
for pid, tasks in active_tasks_by_project.items()
if pid and tasks
)
task_ids_for_comments: set[str] = set()
for task_list in active_tasks_by_project.values():
for task in task_list:
task_id = getattr(task, "id", None)
if task_id:
task_ids_for_comments.add(str(task_id))
completed_task_ids_for_comments: set[str] = set()
skipped_completed_history = {}
for task_list in completed_tasks_by_project.values():
for task in task_list:
task_id = getattr(task, "id", None)
if task_id:
task_ids_for_comments.add(str(task_id))
key = make_completed_task_key_from_api(task)
if key is None:
continue
history_entry = history_by_key.get(key)
if history_entry:
skipped_completed_history[key] = history_entry
else:
completed_task_ids_for_comments.add(key[0])
comments_by_task = fetch_comments_by_task(
api, comment_project_ids, sorted(task_ids_for_comments)
api,
active_comment_project_ids,
sorted(completed_task_ids_for_comments),
)
completed_history = load_completed_history()
updated_history = {}
data = []
for project in projects:
@ -460,6 +574,15 @@ def main():
processed_active = [process_task(t, comments_by_task) for t in active_tasks]
processed_completed = [process_task(t, comments_by_task) for t in completed_tasks]
for task in processed_completed:
key = make_completed_task_key_from_dict(task)
history_entry = skipped_completed_history.get(key) if key else None
if history_entry:
if (not task.get('comments')) and history_entry.get('comments'):
task['comments'] = copy.deepcopy(history_entry['comments'])
if (not task.get('attachments')) and history_entry.get('attachments'):
task['attachments'] = copy.deepcopy(history_entry['attachments'])
# Build hierarchy for active tasks
project_dict['tasks'] = build_task_hierarchy(processed_active)
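The backfill loop at the end copies archived comments and attachments onto freshly processed completed tasks. `copy.deepcopy` keeps the export and the history entry from sharing mutable lists, so a later edit to one cannot leak into the other; a minimal illustration:

```python
import copy

history_entry = {"comments": [{"content": "archived note"}]}
task = {"comments": []}

if (not task.get("comments")) and history_entry.get("comments"):
    task["comments"] = copy.deepcopy(history_entry["comments"])

task["comments"][0]["content"] = "edited during export"
print(history_entry["comments"][0]["content"])  # still "archived note"
```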