cleantech_data/
bronze/
kaggle/
cleantech-media-dataset/YYYY-MM-DD/{ original.zip, raw.jsonl.gz, raw_manifest.jsonl, extracted/ }
cleantech-google-patent-dataset/YYYY-MM-DD/{ ... }
openalex/
YYYY-MM-DD/{ raw.jsonl.gz, raw_manifest.jsonl }
topics/YYYY-MM-DD/{ topics.jsonl.gz, raw_manifest.jsonl, extracted/topics.jsonl (optional) }
silver/
media/YYYY-MM-DD/{ media_canonical.*, media_dupe_links.*, media_excluded_*.* }
patents/YYYY-MM-DD/{ patent_canonical.*, patent_dupe_links.*, patents_normalized.* }
openalex/YYYY-MM-DD/{ topics_canonical.*, topic_*.*, *_ref.* }
unified/unified_docs.parquet