Add API debug output and optimize comment fetching for completed tasks

2025-10-18 13:14:35 -04:00
parent c0cbbb00ba
commit e1f41307f2
2 changed files with 179 additions and 55 deletions

View File

@@ -9,6 +9,7 @@ Todoist is a SaaS task manager. Todoist provides backups of current tasks, but d
 - Downloads attachments to `output/attachments/` and references them in the JSON and HTML output
 - JSON and HTML files are named with the current date when the script is run
 - Maintains `Todoist-Completed-History.json` so completed tasks older than Todoist's 90-day API window stay in future exports
+- Reuses archived comments for completed tasks to avoid unnecessary API calls (assumes no new comments after completion)
 
 ## Setup
 - Ensure you have Python 3.8 or newer installed. Check with `python --version` on the command line.
@@ -26,8 +27,8 @@ Todoist is a SaaS task manager. Todoist provides backups of current tasks, but d
 ```bash
 python export_todoist.py export
 ```
 This will create `output/Todoist-Actual-Backup-YYYY-MM-DD.json` and `output/Todoist-Actual-Backup-YYYY-MM-DD.html`, and it will update `output/attachments/` with any downloaded files while leaving `Todoist-Completed-History.json` in the project root.
 Keep `Todoist-Completed-History.json` somewhere safe (e.g., in source control or a backup location); it is the only way the exporter can retain completions older than Todoist's 90-day API retention window.
 4. To see usage instructions, run the script with no arguments or any argument other than `export`.
 
 ## Requirements
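The new README bullet above relies on giving each completed task a stable identity so archived comments can be matched against fresh exports. A minimal standalone sketch of that keying idea, with hypothetical `archive`/`fresh` data shaped like the exporter's dict-based history entries:

```python
# Sketch: reuse archived comments for completed tasks, keyed by (id, completed_at).
# `archive` and `fresh` are hypothetical stand-ins for the contents of
# Todoist-Completed-History.json and a new API export, respectively.

def task_key(task):
    """Identity of a completion: task id plus completion timestamp."""
    task_id = str(task.get("id", ""))
    return (task_id, str(task.get("completed_at", ""))) if task_id else None

def backfill_comments(fresh, archive):
    """Copy archived comments onto matching fresh tasks; return the reuse count."""
    archived_by_key = {task_key(t): t for t in archive if task_key(t)}
    reused = 0
    for task in fresh:
        entry = archived_by_key.get(task_key(task))
        if entry and not task.get("comments") and entry.get("comments"):
            task["comments"] = entry["comments"]  # reused; no API call needed
            reused += 1
    return reused

archive = [{"id": "100", "completed_at": "2025-07-01T12:00:00Z",
            "comments": [{"content": "done!"}]}]
fresh = [{"id": "100", "completed_at": "2025-07-01T12:00:00Z", "comments": []}]
print(backfill_comments(fresh, archive))  # 1
```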

View File

@@ -4,7 +4,9 @@ import json
 import time
 import getpass
 import shutil
+import copy
 from collections import defaultdict
+from urllib.parse import quote_plus
 import requests
 from datetime import datetime, timedelta
 from todoist_api_python.api import TodoistAPI
@@ -15,6 +17,12 @@ ATTACHMENTS_DIR = os.path.join(OUTPUT_DIR, "attachments")
 LEGACY_ATTACHMENTS_DIR = "attachments"
 TODOIST_API_TOKEN: str | None = None
 COMPLETED_HISTORY_FILE = "Todoist-Completed-History.json"
+COMMENT_REQUEST_MIN_INTERVAL = 0.5  # seconds
+COMMENT_MAX_ATTEMPTS = 8
+PROJECTS_URL = "https://api.todoist.com/rest/v2/projects"
+TASKS_URL = "https://api.todoist.com/rest/v2/tasks"
+COMPLETED_TASKS_URL = "https://api.todoist.com/api/v1/tasks/completed/by_completion_date"
+COMMENTS_URL = "https://api.todoist.com/api/v1/comments"
 
 
 def json_serial(obj):
@@ -119,28 +127,63 @@ def normalize_timestamp(value):
     return str(value)
 
 
+def make_completed_task_key_from_dict(task):
+    task_id = str(task.get('id', '')) if isinstance(task, dict) else ""
+    if not task_id:
+        return None
+    completed_at = normalize_timestamp(task.get('completed_at'))
+    if not completed_at:
+        completed_at = normalize_timestamp(task.get('updated_at'))
+    return (task_id, completed_at)
+
+
+def make_completed_task_key_from_api(task):
+    task_id = getattr(task, "id", None)
+    if not task_id:
+        return None
+    completed_at = normalize_timestamp(getattr(task, "completed_at", None))
+    if not completed_at:
+        completed_at = normalize_timestamp(getattr(task, "updated_at", None))
+    return (str(task_id), completed_at)
+
+
 def merge_completed_lists(history_tasks, new_tasks):
     merged = []
-    seen = set()
+    index_by_key = {}
 
-    def make_key(task):
-        task_id = str(task.get('id', ''))
-        completed_at = normalize_timestamp(task.get('completed_at'))
-        if not completed_at:
-            completed_at = normalize_timestamp(task.get('updated_at'))
-        return (task_id, completed_at)
+    def merge_task_dicts(primary, secondary, prefer_primary=True):
+        for key, value in secondary.items():
+            if key == 'comments':
+                if (not primary.get('comments')) and value:
+                    primary['comments'] = value
+                continue
+            if key == 'attachments':
+                if (not primary.get('attachments')) and value:
+                    primary['attachments'] = value
+                continue
+            if key not in primary or primary[key] in (None, "", [], {}):
+                primary[key] = value
+                continue
+            if not prefer_primary:
+                primary[key] = value
+        return primary
 
-    def add_task(task):
-        key = make_key(task)
-        if key in seen:
-            return
-        seen.add(key)
-        merged.append(task)
+    def add_or_merge(task, prefer_existing=True):
+        key = make_completed_task_key_from_dict(task)
+        if key is None:
+            merged.append(task)
+            return
+        if key in index_by_key:
+            idx = index_by_key[key]
+            merge_task_dicts(merged[idx], task, prefer_primary=prefer_existing)
+        else:
+            merged.append(task)
+            index_by_key[key] = len(merged) - 1
 
     for item in new_tasks:
-        add_task(item)
+        add_or_merge(item, prefer_existing=True)
     for item in history_tasks:
-        add_task(item)
+        add_or_merge(item, prefer_existing=True)
 
     def sort_key(task):
         completed_at = normalize_timestamp(task.get('completed_at'))
@@ -232,11 +275,12 @@ def _get_retry_delay(response, attempt, base_delay=5, max_delay=120):
     return min(max_delay, base_delay * (2 ** attempt))
 
 
-def execute_with_rate_limit(func, *args, **kwargs):
+def execute_with_rate_limit(func, *args, max_attempts=5, request_desc=None, **kwargs):
     attempts = 0
-    max_attempts = 5
+    desc = request_desc or getattr(func, "__name__", "call")
     while True:
         try:
+            print(f" Calling {desc}")
             return func(*args, **kwargs)
         except Exception as error:  # pylint: disable=broad-except
             status_code = getattr(error, "status_code", None)
@@ -246,7 +290,9 @@ def execute_with_rate_limit(func, *args, **kwargs):
             if status_code == 429 and attempts < max_attempts:
                 delay = _get_retry_delay(response, attempts)
                 attempts += 1
-                print(f"Rate limit hit for {func.__name__}. Waiting {delay} seconds before retry {attempts}/{max_attempts}...")
+                print(f" Rate limit hit for {desc}. Waiting {delay} seconds before retry {attempts}/{max_attempts}...")
+                if delay > 1:
+                    print(f" Waiting {delay} seconds due to rate limiting")
                 time.sleep(delay)
                 continue
             raise
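For reference, `_get_retry_delay` ends in an exponential-backoff fallback capped at `max_delay` (its earlier lines fall outside this hunk). A standalone sketch of that fallback schedule, using the same defaults (base 5 s, cap 120 s):

```python
# Exponential backoff with a ceiling, mirroring the fallback formula above.
def fallback_delay(attempt, base_delay=5, max_delay=120):
    return min(max_delay, base_delay * (2 ** attempt))

# Attempts 0..5 wait 5, 10, 20, 40, 80, 120 seconds; later attempts stay capped.
print([fallback_delay(n) for n in range(6)])  # [5, 10, 20, 40, 80, 120]
```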
@@ -255,7 +301,10 @@ def execute_with_rate_limit(func, *args, **kwargs):
 def fetch_all_projects(api):
     projects_by_id = {}
     try:
-        projects_iter = execute_with_rate_limit(api.get_projects)
+        projects_iter = execute_with_rate_limit(
+            api.get_projects,
+            request_desc=f"GET {PROJECTS_URL}"
+        )
         for batch in projects_iter:
             for project in batch:
                 projects_by_id[str(getattr(project, "id", ""))] = project
@@ -267,7 +316,10 @@ def fetch_all_projects(api):
 def fetch_active_tasks_by_project(api):
     tasks_by_project = defaultdict(list)
     try:
-        tasks_iter = execute_with_rate_limit(api.get_tasks)
+        tasks_iter = execute_with_rate_limit(
+            api.get_tasks,
+            request_desc=f"GET {TASKS_URL}"
+        )
         for batch in tasks_iter:
             for task in batch:
                 tasks_by_project[str(getattr(task, "project_id", ""))].append(task)
@@ -280,8 +332,10 @@ def fetch_active_tasks_by_project(api):
 def fetch_completed_tasks_by_project(api, since, until):
     tasks_by_project = defaultdict(list)
     try:
+        query = f"?since={since.isoformat()}&until={until.isoformat()}"
         completed_iter = execute_with_rate_limit(
             api.get_completed_tasks_by_completion_date,
+            request_desc=f"GET {COMPLETED_TASKS_URL}{query}",
             since=since,
             until=until,
         )
@@ -297,29 +351,78 @@ def fetch_completed_tasks_by_project(api, since, until):
 def fetch_comments_by_task(api, project_ids, task_ids):
     comments_by_task = defaultdict(list)
     total_comments = 0
+    last_comment_call = 0.0
+
+    def throttled_get_comments(**kwargs):
+        nonlocal last_comment_call
+        elapsed = time.time() - last_comment_call
+        if elapsed < COMMENT_REQUEST_MIN_INTERVAL:
+            time.sleep(COMMENT_REQUEST_MIN_INTERVAL - elapsed)
+        params = []
+        for key, value in kwargs.items():
+            if value is None:
+                continue
+            params.append(f"{key}={quote_plus(str(value))}")
+        query = "&".join(params)
+        desc = f"GET {COMMENTS_URL}{('?' + query) if query else ''}"
+        result = execute_with_rate_limit(
+            api.get_comments,
+            max_attempts=COMMENT_MAX_ATTEMPTS,
+            request_desc=desc,
+            **kwargs,
+        )
+        last_comment_call = time.time()
+        return result
+
+    def handle_comment_error(scope, identifier, error):
+        status_code = getattr(error, "status_code", None)
+        response = getattr(error, "response", None)
+        if status_code is None and response is not None:
+            status_code = getattr(response, "status_code", None)
+        if status_code == 404:
+            print(f" Comments not found for {scope} {identifier} (404). Skipping.")
+            return False
+        if status_code == 429:
+            delay = _get_retry_delay(response, COMMENT_MAX_ATTEMPTS)
+            print(
+                f" Rate limit while fetching comments for {scope} {identifier} after retries; waiting {delay} seconds before continuing."
+            )
+            if delay > 1:
+                print(f" Waiting {delay} seconds due to rate limiting")
+            time.sleep(delay)
+            return True
+        print(f" Error fetching comments for {scope} {identifier}: {error}")
+        return False
+
     for project_id in project_ids:
-        try:
-            comments_iter = execute_with_rate_limit(api.get_comments, project_id=project_id)
-            for batch in comments_iter:
-                for comment in batch:
-                    task_id = str(getattr(comment, "task_id", ""))
-                    if task_id:
-                        comments_by_task[task_id].append(comment)
-                        total_comments += 1
-        except Exception as error:  # pylint: disable=broad-except
-            print(f"Error fetching comments for project {project_id}: {error}")
+        while True:
+            try:
+                comments_iter = throttled_get_comments(project_id=project_id)
+                for batch in comments_iter:
+                    for comment in batch:
+                        task_id = str(getattr(comment, "task_id", ""))
+                        if task_id:
+                            comments_by_task[task_id].append(comment)
+                            total_comments += 1
+                break
+            except Exception as error:  # pylint: disable=broad-except
+                if not handle_comment_error("project", project_id, error):
+                    break
     missing_task_ids = [task_id for task_id in task_ids if task_id not in comments_by_task]
     for task_id in missing_task_ids:
-        try:
-            comments_iter = execute_with_rate_limit(api.get_comments, task_id=task_id)
-            for batch in comments_iter:
-                for comment in batch:
-                    key = str(getattr(comment, "task_id", ""))
-                    if key:
-                        comments_by_task[key].append(comment)
-                        total_comments += 1
-        except Exception as error:  # pylint: disable=broad-except
-            print(f"Error fetching comments for task {task_id}: {error}")
+        while True:
+            try:
+                comments_iter = throttled_get_comments(task_id=task_id)
+                for batch in comments_iter:
+                    for comment in batch:
+                        key = str(getattr(comment, "task_id", ""))
+                        if key:
+                            comments_by_task[key].append(comment)
+                            total_comments += 1
+                break
+            except Exception as error:  # pylint: disable=broad-except
+                if not handle_comment_error("task", task_id, error):
+                    break
     print(
         f"Fetched {total_comments} comments mapped to {len(comments_by_task)} tasks"
     )
@@ -429,26 +532,37 @@ def main():
     until = datetime.now()
     active_tasks_by_project = fetch_active_tasks_by_project(api)
     completed_tasks_by_project = fetch_completed_tasks_by_project(api, since=since, until=until)
-    comment_project_ids = sorted(
+    completed_history = load_completed_history()
+    history_by_key = {}
+    for task_list in completed_history.values():
+        for stored_task in task_list:
+            key = make_completed_task_key_from_dict(stored_task)
+            if key:
+                history_by_key[key] = stored_task
+
+    active_comment_project_ids = sorted(
         pid
-        for pid in (set(active_tasks_by_project.keys()) | set(completed_tasks_by_project.keys()))
-        if pid
+        for pid, tasks in active_tasks_by_project.items()
+        if pid and tasks
     )
-    task_ids_for_comments: set[str] = set()
-    for task_list in active_tasks_by_project.values():
-        for task in task_list:
-            task_id = getattr(task, "id", None)
-            if task_id:
-                task_ids_for_comments.add(str(task_id))
+    completed_task_ids_for_comments: set[str] = set()
+    skipped_completed_history = {}
     for task_list in completed_tasks_by_project.values():
         for task in task_list:
-            task_id = getattr(task, "id", None)
-            if task_id:
-                task_ids_for_comments.add(str(task_id))
+            key = make_completed_task_key_from_api(task)
+            if key is None:
+                continue
+            history_entry = history_by_key.get(key)
+            if history_entry:
+                skipped_completed_history[key] = history_entry
+            else:
+                completed_task_ids_for_comments.add(key[0])
+
     comments_by_task = fetch_comments_by_task(
-        api, comment_project_ids, sorted(task_ids_for_comments)
+        api,
+        active_comment_project_ids,
+        sorted(completed_task_ids_for_comments),
     )
-    completed_history = load_completed_history()
     updated_history = {}
     data = []
     for project in projects:
@@ -460,6 +574,15 @@ def main():
 
         processed_active = [process_task(t, comments_by_task) for t in active_tasks]
         processed_completed = [process_task(t, comments_by_task) for t in completed_tasks]
+
+        for task in processed_completed:
+            key = make_completed_task_key_from_dict(task)
+            history_entry = skipped_completed_history.get(key) if key else None
+            if history_entry:
+                if (not task.get('comments')) and history_entry.get('comments'):
+                    task['comments'] = copy.deepcopy(history_entry['comments'])
+                if (not task.get('attachments')) and history_entry.get('attachments'):
+                    task['attachments'] = copy.deepcopy(history_entry['attachments'])
 
         # Build hierarchy for active tasks
         project_dict['tasks'] = build_task_hierarchy(processed_active)
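Taken together, the diff paces comment requests two ways: `execute_with_rate_limit` retries 429s with backoff, and `throttled_get_comments` enforces a minimum spacing between successive calls. A minimal sketch of the spacing half of that pattern on its own, with hypothetical names (`MIN_INTERVAL`, `fetch`):

```python
import time

MIN_INTERVAL = 0.5  # seconds between successive requests, matching the diff's constant
_last_call = 0.0

def throttled(fetch, *args, **kwargs):
    """Sleep just long enough to keep calls at least MIN_INTERVAL apart."""
    global _last_call
    elapsed = time.monotonic() - _last_call
    if elapsed < MIN_INTERVAL:
        time.sleep(MIN_INTERVAL - elapsed)
    result = fetch(*args, **kwargs)
    _last_call = time.monotonic()
    return result

# Usage: throttled(api.get_comments, task_id="123")
```

The sketch uses `time.monotonic()` so clock adjustments cannot shrink the interval; the diff itself uses `time.time()` and records the timestamp only after a successful call.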