Skip to content

Cleantech Silver Validation + Unification (Latest Buckets Version 2025-08-09)

This notebook reads the latest Silver buckets for: media — media_canonical, media_dupe_links; patents — patent_canonical, patent_dupe_links, patents_normalized; openalex — topics_canonical, topic_keywords_m2m, topic_siblings_m2m, domains_ref, fields_ref, subfields_ref.

Then it unifies the three sources into a single Parquet file at:

Text Only
cleantech_data/silver/unified/unified_docs.parquet

Minimal unified schema

Text Only
doc_id, doc_type, title, text, date, lang, source, url, country, cpc_codes
- doc_type ∈ {"media","patent","topic"}
- date is UTC-naive datetime64[ns]
- lang is ISO‑639‑1 where available, else "unknown"
- country only for patents (2-letter code), else NA
- cpc_codes is a list (empty list for non‑patents)

Notes:

- The notebook tolerates either Parquet or CSV.GZ Silver outputs (because your safe_write may fall back to CSV).
- It adapts to the updated OpenAlex topic structure (created_date / updated_date, lang_description, lang_display_name).

Python
from __future__ import annotations

import os, re, ast, json, hashlib
from pathlib import Path
from datetime import datetime
from typing import Optional, Iterable, Any, List, Dict

import pandas as pd

# Rich display when running under IPython/Jupyter; otherwise degrade to
# plain-text stand-ins so the module still runs as a script.
try:
    from IPython.display import display, Markdown
    _HAS_RICH = True
except Exception:
    _HAS_RICH = False
    def display(x): print(x)
    def Markdown(x): return x

# Wide pandas display settings for notebook-style profiling output.
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 160)
pd.set_option("display.max_colwidth", 160)

# ---------------------------- small IO helpers ----------------------------

def find_file(base_dir: str | Path, stem: str) -> Optional[Path]:
    """Locate the Silver output file for *stem* under *base_dir*.

    An exact ``<stem><ext>`` match wins outright; otherwise prefix matches
    (``<stem>*<ext>``) are ranked Parquet first, then ``.csv.gz``, then plain
    ``.csv``, with ties broken by file name. Returns None when nothing matches
    or the directory does not exist.
    """
    base = Path(base_dir)
    if not base.exists():
        return None
    extensions = (".parquet", ".pq", ".csv.gz", ".csv")

    # Exact-name hit: take it immediately.
    for ext in extensions:
        exact = base / f"{stem}{ext}"
        if exact.exists():
            return exact

    # Fall back to prefix globs across all known extensions.
    matches: list[Path] = []
    for ext in extensions:
        matches.extend(base.glob(f"{stem}*{ext}"))
    if not matches:
        return None

    def _rank(candidate: Path) -> tuple[int, str]:
        suffix = candidate.suffix.lower()
        if suffix in (".parquet", ".pq"):
            return (0, candidate.name)
        if suffix == ".gz":  # ".csv.gz" files carry the ".gz" suffix
            return (1, candidate.name)
        return (2, candidate.name)

    return min(matches, key=_rank)

def _read_parquet(path: Path, n_rows: Optional[int]) -> pd.DataFrame:
    """Read a Parquet file, optionally truncating to the first *n_rows* rows.

    Uses ``is not None`` (not truthiness) so ``n_rows=0`` yields an empty
    frame, matching ``pd.read_csv(nrows=0)`` behavior in ``_read_csv_like``.
    """
    df = pd.read_parquet(path)
    return df.head(n_rows) if n_rows is not None else df

def _read_csv_like(path: Path, n_rows: Optional[int]) -> pd.DataFrame:
    """Read a CSV file (optionally gzip-compressed), limited to *n_rows* rows when given."""
    gzipped = path.suffix.lower() == ".gz" or path.name.endswith(".csv.gz")
    if gzipped:
        return pd.read_csv(path, compression="infer", nrows=n_rows, low_memory=False)
    return pd.read_csv(path, nrows=n_rows, low_memory=False)

def read_any(path: str | Path, n_rows: Optional[int] = None) -> pd.DataFrame:
    """Read Parquet or CSV(.gz) based on extension, trying the other format on failure.

    The extension picks the primary reader; any exception falls through to the
    alternate reader, tolerating mislabeled Silver outputs.
    """
    p = Path(path)
    suffix = p.suffix.lower()
    if suffix in (".parquet", ".pq"):
        primary, fallback = _read_parquet, _read_csv_like
    elif suffix in (".csv", ".gz"):
        primary, fallback = _read_csv_like, _read_parquet
    else:
        # Unknown extension: assume Parquet (the pipeline default) first.
        primary, fallback = _read_parquet, _read_csv_like
    try:
        return primary(p, n_rows)
    except Exception:
        return fallback(p, n_rows)

# ------------------------- profiling display utils ------------------------

def _maybe_drop_header_row(df: pd.DataFrame):
    """Drop the first row if it duplicates the column names (CSV round-trip artifact).

    Returns ``(frame, dropped_flag)``; the frame is re-indexed when a row was dropped.
    """
    if df.empty:
        return df, False
    try:
        header_names = [str(c) for c in df.columns.tolist()]
        row_values = df.iloc[0].astype(str).tolist()
        if row_values == header_names:
            return df.iloc[1:].reset_index(drop=True), True
    except Exception:
        # Comparison can fail on exotic dtypes; keep the frame untouched.
        pass
    return df, False

def _dtypes_table(df: pd.DataFrame) -> pd.DataFrame:
    """One-column dtype table ordered bool < int < float < string < object < datetime < category."""
    precedence = {
        "bool": 0, "boolean": 0,
        "Int64": 1, "int64": 1,
        "Float64": 2, "float64": 2,
        "string": 3,
        "object": 4,
        "datetime64[ns]": 5,
        "category": 6,
    }
    table = df.dtypes.astype(str).to_frame(name="dtype")
    ranks = table["dtype"].map(lambda t: precedence.get(t, 99))
    return table.assign(_rank=ranks).sort_values(["_rank", "dtype"]).drop(columns="_rank")

def _show_df(df: pd.DataFrame, n: int = 5):
    """Show the first *n* rows: rich display when available, plain print otherwise."""
    preview = df.head(n)
    if _HAS_RICH:
        display(preview)
    else:
        print(preview.to_string())

def basic_profile(df: pd.DataFrame, name: str = "", head_n: int = 5) -> pd.DataFrame:
    """Profile a frame: shape, dtypes, top-20 null fractions and a head preview.

    Strips a duplicated header row if present and returns the (possibly
    cleaned) frame for further use.
    """
    if name:
        display(Markdown(f"### {name}"))
    print("shape:", df.shape)
    cleaned, header_dropped = _maybe_drop_header_row(df)
    if header_dropped:
        print("[fix] Dropped first row that duplicated the header names.")
    display(Markdown("**Dtypes**"))
    display(_dtypes_table(cleaned))
    display(Markdown("**Top 20 Null Fractions**"))
    null_fracs = cleaned.isna().mean().sort_values(ascending=False).to_frame("na_frac")
    _show_df(null_fracs, n=min(20, len(null_fracs)))
    display(Markdown(f"**Head({head_n})**"))
    with pd.option_context("display.max_colwidth", 140):
        _show_df(cleaned, n=head_n)
    return cleaned

def bar_top(df: pd.DataFrame, col: str, n: int = 20, title: Optional[str] = None):
    """Bar-chart the top *n* value counts of *col*; fall back to a count table
    when matplotlib is unavailable or plotting fails."""
    try:
        import matplotlib.pyplot as plt
        counts = df[col].value_counts(dropna=False).head(n)
        plt.figure(figsize=(10, 4))
        counts.plot(kind="bar")
        plt.xticks(rotation=45, ha="right")
        if title:
            plt.title(title)
        plt.tight_layout()
        plt.show()
    except Exception:
        display(df[col].value_counts(dropna=False).head(n).to_frame("count"))

def date_range(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Single-row summary (non_null count, min, max) of a datetime-like column.

    Missing columns yield a zero/NaT row so callers never have to branch.
    """
    if col not in df.columns:
        return pd.DataFrame({"non_null": [0], "min": [pd.NaT], "max": [pd.NaT]}, index=[col])
    parsed = pd.to_datetime(df[col], errors="coerce")
    summary = {
        "non_null": [int(parsed.notna().sum())],
        "min": [parsed.min()],
        "max": [parsed.max()],
    }
    return pd.DataFrame(summary, index=[col])

# Pretty coverage helper
def show_frac(label: str, frac) -> None:
    """Render a labelled fraction to four decimal places (NaN when not numeric)."""
    try:
        value = float(frac)
    except Exception:
        value = float("nan")
    display(Markdown(f"**{label}:** `{value:.4f}`"))

# -------------------------- environment discovery -------------------------

# Matches YYYY-MM-DD bucket directory names.
# NOTE(review): not referenced in this part of the file (latest_bucket parses
# via strptime instead) — confirm it is used elsewhere before removing.
_DATE_RE = re.compile(r"^\d{4}-\d{2}-\d{2}$")

def _repo_root_from_cwd() -> Path:
    """Walk upward from CWD to the first directory holding .git or pyproject.toml.

    Falls back to the CWD itself when no marker is found.
    """
    here = Path.cwd()
    for candidate in (here, *here.parents):
        markers = (candidate / ".git", candidate / "pyproject.toml")
        if any(marker.exists() for marker in markers):
            return candidate
    return here

def discover_data_root() -> Path:
    """Resolve the cleantech_data directory.

    Precedence: CLEANTECH_DATA_DIR env var (if it exists on disk), then
    <repo>/cleantech_data, then <repo>/notebooks/cleantech_data, then a
    cleantech_data path under the CWD as a last resort.
    """
    env = os.getenv("CLEANTECH_DATA_DIR")
    if env and Path(env).exists():
        return Path(env).resolve()
    repo = _repo_root_from_cwd()
    for candidate in (repo / "cleantech_data", repo / "notebooks" / "cleantech_data"):
        resolved = candidate.resolve()
        if resolved.exists():
            return resolved
    return (Path.cwd() / "cleantech_data").resolve()

def latest_bucket(base: Path) -> Path:
    """Return the newest sub-directory of *base*.

    Date-named (YYYY-MM-DD) directories win, latest date first; otherwise the
    most recently modified directory; otherwise *base* itself (also returned
    when *base* does not exist).
    """
    if not base.exists():
        return base
    dated, undated = [], []
    for child in base.iterdir():
        if not child.is_dir():
            continue
        try:
            bucket_date = datetime.strptime(child.name, "%Y-%m-%d").date()
        except Exception:
            undated.append(child)
        else:
            dated.append((bucket_date, child))
    if dated:
        return max(dated, key=lambda pair: pair[0])[1]
    if undated:
        return max(undated, key=lambda d: d.stat().st_mtime)
    return base

def silver_bucket(dataset: str, root: Path) -> Path:
    """Resolve the newest dated bucket under ``<root>/silver/<dataset>``."""
    dataset_base = root / "silver" / dataset
    return latest_bucket(dataset_base)

def list_dir_files(p: Path) -> pd.DataFrame:
    """Table of files directly under *p*: name, byte size, mtime, human size.

    Rows are sorted newest-first; an empty frame is returned for a missing or
    empty directory.
    """
    records = []
    if p.exists():
        for entry in p.iterdir():
            if not entry.is_file():
                continue
            try:
                st = entry.stat()
                records.append({
                    "file": entry.name,
                    "bytes": st.st_size,
                    "modified": datetime.fromtimestamp(st.st_mtime),
                })
            except Exception:
                # stat() can fail on transient/locked files; keep the name.
                records.append({"file": entry.name, "bytes": None, "modified": None})
    out = pd.DataFrame(records)
    if out.empty:
        return out
    if "modified" in out.columns:
        out = out.sort_values("modified", ascending=False)
    if "bytes" in out.columns:
        def _human(size):
            """Format a byte count as e.g. '1.5 MB'."""
            if size is None:
                return None
            value = float(size)
            for unit in ("B", "KB", "MB", "GB", "TB"):
                if value < 1024:
                    return f"{value:.1f} {unit}"
                value /= 1024
            return f"{value:.1f} PB"
        out["size"] = out["bytes"].map(_human)
    return out

# Media dtype coercion (bool/Int typed cleanly)
def coerce_media_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy with media columns coerced to clean nullable dtypes.

    is_listing -> pandas "boolean" (textual true/false spellings accepted),
    content_*_count -> Int64, published_dt -> datetime (invalid -> NaT).
    """
    out = df.copy()

    if "is_listing" in out.columns:
        def _parse_bool(value):
            # Accept common truthy/falsy spellings; anything else -> NA.
            if pd.isna(value):
                return pd.NA
            token = str(value).strip().lower()
            if token in {"true", "1", "yes"}:
                return True
            if token in {"false", "0", "no"}:
                return False
            return pd.NA
        out["is_listing"] = out["is_listing"].map(_parse_bool).astype("boolean")

    for count_col in ("content_char_count", "content_word_count"):
        if count_col in out.columns:
            out[count_col] = pd.to_numeric(out[count_col], errors="coerce").astype("Int64")

    if "published_dt" in out.columns:
        out["published_dt"] = pd.to_datetime(out["published_dt"], errors="coerce")
    return out

# Robust list cardinality (patents etc.)
def cell_cardinality(x) -> int:
    """Return the number of elements represented by a cell value.

    Handles real sequences, numpy/extension arrays, stringified lists like
    "[a, b]" (CSV round-trip artifact), scalars (count 1), and missing values
    (None/NaN/pd.NA -> 0).

    Fix: the scalar missing-value check previously matched only ``bool`` and
    ``type(pd.NA)``, so a ``numpy.bool_`` result from ``pd.isna`` slipped
    through and a NaN cell could be counted as 1. We now accept any 0-dim
    scalar truth value (covers both ``bool`` and ``numpy.bool_``) while still
    skipping array-shaped results for list-like inputs.
    """
    if x is None:
        return 0
    try:
        is_na = pd.isna(x)
        # Only treat scalar (0-dim) NA results as "missing"; array results
        # mean x is list-like and is handled below.
        if not isinstance(is_na, (list, tuple)) and getattr(is_na, "ndim", 0) == 0:
            if bool(is_na):
                return 0
    except (TypeError, ValueError):
        pass
    if isinstance(x, (list, tuple, set)):
        return len(x)
    try:
        import numpy as np
        if isinstance(x, (np.ndarray, pd.api.extensions.ExtensionArray)):
            return int(getattr(x, "size", len(x)))
    except Exception:
        pass
    if isinstance(x, str):
        s = x.strip()
        # Stringified list from a CSV round-trip: parse and count.
        if len(s) >= 2 and s[0] == "[" and s[-1] == "]":
            try:
                parsed = ast.literal_eval(s)
                if isinstance(parsed, (list, tuple, set)):
                    return len(parsed)
            except Exception:
                pass
        return 0 if s == "" else 1
    return 1
Python
root = discover_data_root()
display(Markdown(f"**Data root (chosen):** `{root}`"))

# Resolve the newest dated Silver bucket for each of the three sources.
media_dir    = silver_bucket("media", root)
patents_dir  = silver_bucket("patents", root)
openalex_dir = silver_bucket("openalex", root)

display(Markdown("**Latest Silver buckets**"))
display({"media": str(media_dir), "patents": str(patents_dir), "openalex": str(openalex_dir)})

# List the contents of each bucket so stale/missing files are obvious.
for bucket_label, bucket_dir in (("Media", media_dir), ("Patents", patents_dir), ("OpenAlex", openalex_dir)):
    display(Markdown(f"#### Files in {bucket_label} bucket"))
    display(list_dir_files(bucket_dir))

Data root (chosen): C:\Users\gerbe\PycharmProjects\MT\cleantech_data

Latest Silver buckets

Text Only
1
2
3
{'media': 'C:\\Users\\gerbe\\PycharmProjects\\MT\\cleantech_data\\silver\\media\\2025-08-09',
 'patents': 'C:\\Users\\gerbe\\PycharmProjects\\MT\\cleantech_data\\silver\\patents\\2025-08-09',
 'openalex': 'C:\\Users\\gerbe\\PycharmProjects\\MT\\cleantech_data\\silver\\openalex\\2025-08-09'}

Files in Media bucket

file bytes modified size
1 media_dupe_links.parquet 13711 2025-08-24 17:35:16.331749 13.4 KB
0 media_canonical.parquet 147587328 2025-08-24 17:35:16.295887 140.8 MB
2 media_excluded_non_articles.parquet 2690049 2025-08-24 17:11:55.436329 2.6 MB

Files in Patents bucket

file bytes modified size
0 patents_normalized.parquet 17524884 2025-08-24 12:22:59.683251 16.7 MB
2 patent_dupe_links.parquet 745944 2025-08-24 12:22:59.338741 728.5 KB
1 patent_canonical.parquet 33063620 2025-08-24 12:22:59.126081 31.5 MB

Files in OpenAlex bucket

file bytes modified size
2 subfields_ref.parquet 6953 2025-08-24 12:30:08.482854 6.8 KB
1 fields_ref.parquet 2249 2025-08-24 12:30:08.478860 2.2 KB
0 domains_ref.parquet 1649 2025-08-24 12:30:08.475859 1.6 KB
5 topic_siblings_m2m.parquet 439273 2025-08-24 12:30:08.469853 429.0 KB
4 topic_keywords_m2m.parquet 943805 2025-08-24 12:30:08.417989 921.7 KB
3 topics_canonical.parquet 1109773 2025-08-24 12:30:08.388933 1.1 MB
Python
# MEDIA — canonical & duplicate links
media_canon_path = find_file(media_dir, "media_canonical")
media_links_path = find_file(media_dir, "media_dupe_links")

if media_canon_path and media_canon_path.exists():
    display(Markdown(f"**media_canonical:** `{media_canon_path}`"))
    # Coerce typed columns first so the profile shows clean dtypes.
    media = coerce_media_dtypes(read_any(media_canon_path))
    media = basic_profile(media, "media_canonical")
    # Numeric summaries for the content-size columns.
    for numeric_col in ["content_char_count", "content_word_count"]:
        if numeric_col in media.columns and pd.api.types.is_numeric_dtype(media[numeric_col]):
            display(Markdown(f"**Describe numeric: `{numeric_col}`**"))
            display(media[numeric_col].describe().to_frame(numeric_col))
    # Frequency tables for the categorical columns of interest.
    for cat_col in ["domain", "lang_title", "lang_content"]:
        if cat_col in media.columns:
            display(Markdown(f"Top values in `{cat_col}`"))
            display(media[cat_col].value_counts().head(15).to_frame("count"))
    if "published_dt" in media.columns:
        display(Markdown("**Date coverage for `published_dt`**"))
        display(date_range(media, "published_dt"))
else:
    display(Markdown("> ⚠️ `media_canonical` not found."))

if media_links_path and media_links_path.exists():
    display(Markdown(f"**media_dupe_links:** `{media_links_path}`"))
    mlinks = basic_profile(read_any(media_links_path), "media_dupe_links")
    if "reason" in mlinks.columns:
        display(Markdown("**Duplicate reasons (media_dupe_links)**"))
        display(mlinks["reason"].value_counts().to_frame("count"))
else:
    display(Markdown("> ⚠️ `media_dupe_links` not found."))

media_canonical: C:\Users\gerbe\PycharmProjects\MT\cleantech_data\silver\media\2025-08-09\media_canonical.parquet

media_canonical

Text Only
1
shape: (19341, 24)

Dtypes

dtype
is_article_like bool
is_listing boolean
content_char_count Int64
content_word_count Int64
author float64
lang_title_score float64
lang_content_score float64
title object
date object
content object
domain object
url object
title_clean object
content_rawtext object
content_clean object
lang_title object
lang_content object
content_sha1 object
title_fp object
url_key object
non_article_reason object
_root object
url_clean object
published_dt datetime64[ns]

Top 20 Null Fractions

na_frac
author 1.0
title 0.0
date 0.0
content 0.0
domain 0.0
url 0.0
is_listing 0.0
title_clean 0.0
content_rawtext 0.0
content_clean 0.0
lang_title 0.0
lang_content 0.0
lang_title_score 0.0
lang_content_score 0.0
content_char_count 0.0
content_word_count 0.0
content_sha1 0.0
title_fp 0.0
published_dt 0.0
url_key 0.0

Head(5)

title date author content domain url is_listing title_clean content_rawtext content_clean lang_title lang_content lang_title_score lang_content_score content_char_count content_word_count content_sha1 title_fp published_dt url_key non_article_reason is_article_like _root url_clean
0 Zambia set for 60 MW/20 MWh of solar, storage – pv magazine International 2024-05-06 NaN ['Turkish developer YEO and Zambian sustainable energy company are constructing a 60 MW solar plant with a 20 MWh battery energy storage... pv-magazine.com http://pv-magazine.com/2024/05/06/zambia-set-for-60-mw-20-mwh-of-solar-storage/ False Zambia set for 60 MW/20 MWh of solar, storage – pv magazine International ['Turkish developer YEO and Zambian sustainable energy company are constructing a 60 MW solar plant with a 20 MWh battery energy storage... Turkish developer YEO and Zambian sustainable energy company are constructing a 60 MW solar plant with a 20 MWh battery energy storage s... en en 0.999994 1.0 3602 589 5eca9aa2862632d015697f5fa0a67536e4490d93 20|60|international|magazine|mw|mwh|pv|set|solar|storage|zambia 2024-05-06 pv-magazine.com/2024/05/06/zambia-set-for-60-mw-20-mwh-of-solar-storage True http://pv-magazine.com/2024/05/06/zambia-set-for-60-mw-20-mwh-of-solar-storage/ http://pv-magazine.com/2024/05/06/zambia-set-for-60-mw-20-mwh-of-solar-storage/
1 All-Energy and Dcarbonise Conference Programme 2024-05-16 NaN ['The renewables and low carbon energy community relishes attending All-Energy and Dcarbonise. In 2023, over 9,706 participants attended... www.all-energy.co.uk http://www.all-energy.co.uk/cfs False All-Energy and Dcarbonise Conference Programme ['The renewables and low carbon energy community relishes attending All-Energy and Dcarbonise. In 2023, over 9,706 participants attended... The renewables and low carbon energy community relishes attending All-Energy and Dcarbonise. In 2023, over 9,706 participants attended t... en en 1.000000 1.0 2664 418 fc0b77c650cd9da58af79f07eb2f7b6a26540674 all|conference|dcarbonise|energy|programme 2024-05-16 all-energy.co.uk/cfs True http://www.all-energy.co.uk/cfs http://www.all-energy.co.uk/cfs
2 API Challenging Court’ s Decision to Nullify GOM Oil, Natural Gas Auction 2022-02-09 NaN ['Sign in to get the best natural gas news and data. Follow the topics you want and receive the daily emails.', 'Your email address *', ... www.naturalgasintel.com http://www.naturalgasintel.com/api-challenging-courts-decision-to-nullify-gom-oil-natural-gas-auction/ False API Challenging Court’ s Decision to Nullify GOM Oil, Natural Gas Auction ['Sign in to get the best natural gas news and data. Follow the topics you want and receive the daily emails.', 'Your email address *', ... Follow the topics you want and receive the daily emails.\nYour email address *\nYour password *\nRemember me Continue\nReset password\nF... en en 0.999987 1.0 4602 751 bd9c3daf2d6c6f2667cba3cc1ccf4af8c34059bf api|auction|challenging|court|decision|gas|gom|natural|nullify|oil 2022-02-09 naturalgasintel.com/api-challenging-courts-decision-to-nullify-gom-oil-natural-gas-auction True http://www.naturalgasintel.com/api-challenging-courts-decision-to-nullify-gom-oil-natural-gas-auction/ http://www.naturalgasintel.com/api-challenging-courts-decision-to-nullify-gom-oil-natural-gas-auction/
3 Arizona Regulator Blocks Salt River Natural Gas Power Plant Expansion 2022-04-14 NaN ['Sign in to get the best natural gas news and data. Follow the topics you want and receive the daily emails.', 'Your email address *', ... www.naturalgasintel.com http://www.naturalgasintel.com/arizona-regulator-blocks-salt-river-natural-gas-power-plant-expansion/ False Arizona Regulator Blocks Salt River Natural Gas Power Plant Expansion ['Sign in to get the best natural gas news and data. Follow the topics you want and receive the daily emails.', 'Your email address *', ... Follow the topics you want and receive the daily emails.\nYour email address *\nYour password *\nRemember me Continue\nReset password\nF... ca en 0.510561 1.0 6740 1106 c0817579053284ca7921a1da09fc1d7720b1d81f arizona|blocks|expansion|gas|natural|plant|power|regulator|river|salt 2022-04-14 naturalgasintel.com/arizona-regulator-blocks-salt-river-natural-gas-power-plant-expansion True http://www.naturalgasintel.com/arizona-regulator-blocks-salt-river-natural-gas-power-plant-expansion/ http://www.naturalgasintel.com/arizona-regulator-blocks-salt-river-natural-gas-power-plant-expansion/
4 Biden Administration's 2023 Budget Doubles Down on Energy Transition 2022-03-29 NaN ['Sign in to get the best natural gas news and data. Follow the topics you want and receive the daily emails.', 'Your email address *', ... www.naturalgasintel.com http://www.naturalgasintel.com/biden-administrations-2023-budget-doubles-down-on-energy-transition/ False Biden Administration's 2023 Budget Doubles Down on Energy Transition ['Sign in to get the best natural gas news and data. Follow the topics you want and receive the daily emails.', 'Your email address *', ... Follow the topics you want and receive the daily emails.\nYour email address *\nYour password *\nRemember me Continue\nReset password\nF... en en 1.000000 1.0 5714 941 fc8285deefa73230c821637e0d4cd20b29eba383 2023|administration|biden|budget|doubles|down|energy|transition 2022-03-29 naturalgasintel.com/biden-administrations-2023-budget-doubles-down-on-energy-transition True http://www.naturalgasintel.com/biden-administrations-2023-budget-doubles-down-on-energy-transition/ http://www.naturalgasintel.com/biden-administrations-2023-budget-doubles-down-on-energy-transition/

Describe numeric: content_char_count

content_char_count
count 19341.0
mean 4723.312393
std 2782.727649
min 305.0
25% 2563.0
50% 4115.0
75% 6413.0
max 19684.0

Describe numeric: content_word_count

content_word_count
count 19341.0
mean 748.544181
std 444.552426
min 50.0
25% 406.0
50% 654.0
75% 1007.0
max 2995.0

Top values in domain

count
domain
www.energy-xprt.com 4058
www.pv-magazine.com 2945
www.azocleantech.com 2457
www.pv-tech.org 1966
cleantechnica.com 1752
www.thinkgeoenergy.com 1052
www.solarpowerportal.co.uk 850
www.solarpowerworldonline.com 778
www.energyvoice.com 635
solarindustrymag.com 620
solarquarter.com 590
www.rechargenews.com 557
www.naturalgasintel.com 295
www.energyintel.com 170
www.iea.org 158

Top values in lang_title

count
lang_title
en 17075
it 809
hu 553
de 171
es 118
nl 103
fr 102
mt 66
da 37
pt 24
tr 24
eu 24
no 21
pl 17
fi 16

Top values in lang_content

count
lang_content
en 19334
la 2
de 2
it 1
ru 1
es 1

Date coverage for published_dt

non_null min max
published_dt 19341 2022-01-02 2024-10-24

media_dupe_links: C:\Users\gerbe\PycharmProjects\MT\cleantech_data\silver\media\2025-08-09\media_dupe_links.parquet

Text Only
1
shape: (149, 4)

Dtypes

dtype
member_id object
canonical_id object
reason object
group_key object

Top 20 Null Fractions

na_frac
member_id 0.0
canonical_id 0.0
reason 0.0
group_key 0.0

Head(5)

member_id canonical_id reason group_key
0 https://indorenergy.com/to-exhibit/ https://indorenergy.com/to-exhibit/ union
1 https://indorenergy.com/to-exhibit/ https://indorenergy.com/to-exhibit/ union
2 https://www.azocleantech.com/Suppliers.aspx?SupplierID=1870 https://www.azocleantech.com/suppliers.aspx?SupplierID=1870 content_sha1|title_fp_in_domain_len90_dt7d 81e3be4966af51df918bcaeca6738445d3a63e5d|www.azocleantech.com||address|contact|inc|materials|mosaic|quotes
3 https://www.energy-xprt.com/applications/page-4 https://www.energy-xprt.com/applications/page-5 title_fp_in_domain_len90_dt7d www.energy-xprt.com||applications|energy|industry|page
4 https://www.energy-xprt.com/bioenergy/anaerobic-digestion/products/keyword-anaerobic-digester-system-62965 https://www.energy-xprt.com/bioenergy/anaerobic-digestion/products/keyword-anaerobic-digestion-system-19516 content_sha1 d255ef33658f43787e67a2c9feed5836bbc219cd

Duplicate reasons (media_dupe_links)

count
reason
title_fp_in_domain_len90_dt7d 91
content_sha1 33
union 17
content_sha1|title_fp_in_domain_len90_dt7d 8
Python
# PATENTS — canonical, duplicate links, normalized
pat_canon_path = find_file(patents_dir, "patent_canonical")
pat_links_path = find_file(patents_dir, "patent_dupe_links")
pat_norm_path  = find_file(patents_dir, "patents_normalized")

if pat_canon_path and pat_canon_path.exists():
    display(Markdown(f"**patent_canonical:** `{pat_canon_path}`"))
    pc = read_any(pat_canon_path)
    # Coerce typed columns up front so describe()/date_range behave.
    if "abstract_len" in pc.columns:
        pc["abstract_len"] = pd.to_numeric(pc["abstract_len"], errors="coerce")
    if "publication_date_dt" in pc.columns:
        pc["publication_date_dt"] = pd.to_datetime(pc["publication_date_dt"], errors="coerce")
    pc = basic_profile(pc, "patent_canonical")
    if "publication_number" in pc.columns:
        display(Markdown("**Duplicate publication_number rows (if any):**"))
        dup_frac = pc.duplicated(subset=["publication_number"], keep=False).mean()
        show_frac("duplicated_frac", dup_frac)
    if "abstract_len" in pc.columns and pd.api.types.is_numeric_dtype(pc["abstract_len"]):
        display(Markdown("**Describe numeric: `abstract_len`**"))
        display(pc["abstract_len"].describe().to_frame("abstract_len"))
    if "publication_date_dt" in pc.columns:
        display(Markdown("**Date coverage for `publication_date_dt`**"))
        display(date_range(pc, "publication_date_dt"))
else:
    display(Markdown("> ⚠️ `patent_canonical` not found."))

if pat_links_path and pat_links_path.exists():
    display(Markdown(f"**patent_dupe_links:** `{pat_links_path}`"))
    pl = basic_profile(read_any(pat_links_path), "patent_dupe_links")
else:
    display(Markdown("> ⚠️ `patent_dupe_links` not found."))

if pat_norm_path and pat_norm_path.exists():
    display(Markdown(f"**patents_normalized:** `{pat_norm_path}`"))
    pn = basic_profile(read_any(pat_norm_path), "patents_normalized")
    # Per-column element counts for list-valued columns (robust to stringified lists).
    for list_col in ["cpc_codes", "inventors", "application_number", "country_code"]:
        if list_col in pn.columns:
            cardinalities = pn[list_col].apply(cell_cardinality)
            display(Markdown(f"Cardinality of `{list_col}` per patent"))
            display(cardinalities.value_counts().sort_index().to_frame("count"))
else:
    display(Markdown("> ⚠️ `patents_normalized` not found."))

patent_canonical: C:\Users\gerbe\PycharmProjects\MT\cleantech_data\silver\patents\2025-08-09\patent_canonical.parquet

patent_canonical

Text Only
1
shape: (31364, 18)

Dtypes

dtype
publication_date int64
abstract_len int64
title_len int64
lang_title_score float64
lang_abs_score float64
eng float64
publication_number object
application_number object
country_code object
title object
abstract object
inventor object
cpc_code object
title_clean object
abstract_clean object
lang_title object
lang_abs object
publication_date_dt datetime64[ns]

Top 20 Null Fractions

na_frac
lang_title_score 0.000128
publication_number 0.000000
application_number 0.000000
country_code 0.000000
abstract 0.000000
title 0.000000
inventor 0.000000
cpc_code 0.000000
title_clean 0.000000
publication_date 0.000000
abstract_clean 0.000000
abstract_len 0.000000
publication_date_dt 0.000000
lang_title 0.000000
lang_abs 0.000000
lang_abs_score 0.000000
eng 0.000000
title_len 0.000000

Head(5)

publication_number application_number country_code title abstract publication_date inventor cpc_code title_clean abstract_clean abstract_len publication_date_dt lang_title lang_title_score lang_abs lang_abs_score eng title_len
0 AU-2016208290-B2 AU-2016208290-A AU Closed loop control system for heliostats CLOSED LOOP CONTROL SYSTEM FOR HELIOSTATS \nAbstract \nDescribed herein is a control system for a solar energy collection apparatus (1).... 20220317 [BURTON, ALEXANDER] G05D3/12 Closed loop control system for heliostats CLOSED LOOP CONTROL SYSTEM FOR HELIOSTATS\nAbstract\nDescribed herein is a control system for a solar energy collection apparatus (1). T... 987 2022-03-17 en 0.999495 en 1.0 1.0 41
1 AU-2016321918-B2 AU-2016321918-A AU Device for capturing solar energy The invention relates to a device for capturing solar energy, including a plurality of single-axis solar trackers (S1-S5), each one havi... 20220407 [LÓPEZ ONA, Sergio, ROS RUÍZ, Antonio José] H02S40/34 Device for capturing solar energy The invention relates to a device for capturing solar energy, including a plurality of single-axis solar trackers (S1-S5), each one havi... 827 2022-04-07 en 0.920249 en 1.0 1.0 33
2 AU-2017246326-B2 AU-2017246326-A AU Combined window shade and solar panel A window shade system includes a mounting bracket configured to couple to a structure, a shade tube bracket configured to rotatably coup... 20220526 [GEIGER, JAMES] E06B9/40 Combined window shade and solar panel A window shade system includes a mounting bracket configured to couple to a structure, a shade tube bracket configured to rotatably coup... 506 2022-05-26 en 1.000000 en 1.0 1.0 37
3 AU-2017267740-B2 AU-2017267740-A AU System and methods for improving the accuracy of solar energy and wind energy forecasts for an electric utility grid A computer system and method for improving the accuracy of predictions of the amount of renewable energy, such as solar energy and wind ... 20220324 [FORBES, KEVIN F., ZAMPELLI, ERNEST M.] G06Q50/06 System and methods for improving the accuracy of solar energy and wind energy forecasts for an electric utility grid A computer system and method for improving the accuracy of predictions of the amount of renewable energy, such as solar energy and wind ... 810 2022-03-24 en 1.000000 en 1.0 1.0 116
4 AU-2017276466-B2 AU-2017276466-A AU Ocean carbon capture and storage method and device Provided is an ocean carbon capture and storage (ocean CCS) method and device for performing carbon capture and storage with respect to ... 20220602 [PENG, SIGAN] B01D53/1475 Ocean carbon capture and storage method and device Provided is an ocean carbon capture and storage (ocean CCS) method and device for performing carbon capture and storage with respect to ... 881 2022-06-02 en 1.000000 en 1.0 1.0 50

Duplicate publication_number rows (if any):

duplicated_frac: 0.0000

Describe numeric: abstract_len

abstract_len
count 31364.000000
mean 1177.386143
std 270.049041
min 64.000000
25% 1036.000000
50% 1235.000000
75% 1359.000000
max 3201.000000

Date coverage for publication_date_dt

non_null min max
publication_date_dt 31364 2022-01-01 2024-09-05

patent_dupe_links: C:\Users\gerbe\PycharmProjects\MT\cleantech_data\silver\patents\2025-08-09\patent_dupe_links.parquet

Text Only
1
shape: (375362, 3)

Dtypes

dtype
member_id object
canonical_id object
reason object

Top 20 Null Fractions

na_frac
member_id 0.0
canonical_id 0.0
reason 0.0

Head(5)

member_id canonical_id reason
0 AU-2016208290-A AU-2016208290-A longer
1 AU-2016321918-A AU-2016321918-A longer
2 AU-2016321918-A AU-2016321918-A longer
3 AU-2016321918-A AU-2016321918-A longer
4 AU-2016321918-A AU-2016321918-A longer

patents_normalized: C:\Users\gerbe\PycharmProjects\MT\cleantech_data\silver\patents\2025-08-09\patents_normalized.parquet

patents_normalized

Text Only
1
shape: (31364, 13)

Dtypes

dtype
lang_title_score float64
lang_abs_score float64
publication_number object
cpc_codes object
inventors object
application_number object
country_codes object
country_code object
title_clean object
abstract_clean object
lang_title object
lang_abs object
publication_date_dt datetime64[ns]

Top 20 Null Fractions

na_frac
lang_title_score 0.000128
cpc_codes 0.000000
inventors 0.000000
application_number 0.000000
publication_number 0.000000
country_codes 0.000000
country_code 0.000000
abstract_clean 0.000000
title_clean 0.000000
publication_date_dt 0.000000
lang_title 0.000000
lang_abs 0.000000
lang_abs_score 0.000000

Head(5)

publication_number cpc_codes inventors application_number country_codes country_code title_clean abstract_clean publication_date_dt lang_title lang_title_score lang_abs lang_abs_score
0 AU-2016208290-B2 [F24S2050/25, G05D3/12] [BURTON, ALEXANDER] [AU-2016208290-A] [AU] AU Closed loop control system for heliostats CLOSED LOOP CONTROL SYSTEM FOR HELIOSTATS\nAbstract\nDescribed herein is a control system for a solar energy collection apparatus (1). T... 2022-03-17 en 0.999495 en 1.0
1 AU-2016321918-B2 [F24S2030/11, F24S2030/15, F24S25/12, F24S30/425, H02S20/32, H02S40/34, Y02E10/47, Y02E10/50] [LÓPEZ ONA, Sergio, ROS RUÍZ, Antonio José] [AU-2016321918-A] [AU] AU Device for capturing solar energy The invention relates to a device for capturing solar energy, including a plurality of single-axis solar trackers (S1-S5), each one havi... 2022-04-07 en 0.920249 en 1.0
2 AU-2017246326-B2 [E06B2009/2476, E06B2009/6827, E06B9/40, E06B9/42, E06B9/50, E06B9/68, H02S30/20, Y02E10/50] [GEIGER, JAMES] [AU-2017246326-A] [AU] AU Combined window shade and solar panel A window shade system includes a mounting bracket configured to couple to a structure, a shade tube bracket configured to rotatably coup... 2022-05-26 en 1.000000 en 1.0
3 AU-2017267740-B2 [F05B2260/8211, G01W1/10, G01W2001/006, G01W2201/00, G01W2203/00, G06F16/23, G06F17/10, G06F17/18, G06N20/00, G06Q10/04, G06Q50/06] [FORBES, KEVIN F., ZAMPELLI, ERNEST M.] [AU-2017267740-A] [AU] AU System and methods for improving the accuracy of solar energy and wind energy forecasts for an electric utility grid A computer system and method for improving the accuracy of predictions of the amount of renewable energy, such as solar energy and wind ... 2022-03-24 en 1.000000 en 1.0
4 AU-2017276466-B2 [B01D2252/1035, B01D2257/504, B01D2258/012, B01D2258/0283, B01D2259/4566, B01D53/1412, B01D53/1475, B01D53/18, B01D53/62, Y02A50/20, Y02... [PENG, SIGAN] [AU-2017276466-A] [AU] AU Ocean carbon capture and storage method and device Provided is an ocean carbon capture and storage (ocean CCS) method and device for performing carbon capture and storage with respect to ... 2022-06-02 en 1.000000 en 1.0

Cardinality of cpc_codes per patent

count
cpc_codes
1 9907
2 2793
3 2750
4 2656
5 2619
6 2252
7 1825
8 1450
9 1148
10 877
11 682
12 511
13 391
14 299
15 199
16 226
17 123
18 124
19 99
20 58
21 51
22 45
23 38
24 30
25 37
26 29
27 15
28 24
29 3
30 17
31 8
32 5
33 2
34 17
35 5
36 2
37 1
39 4
40 3
41 5
45 5
46 15
47 2
49 6
52 4
72 2

Cardinality of inventors per patent

count
inventors
0 6989
1 6024
2 3086
3 3437
4 2475
5 2458
6 1866
7 1297
8 981
9 689
10 616
11 359
12 292
13 181
14 148
15 134
16 91
17 50
18 50
19 29
20 40
21 9
22 17
23 9
24 10
25 6
26 4
27 7
28 1
29 1
30 1
34 1
37 1
38 2
44 1
45 1
47 1

Cardinality of application_number per patent

count
application_number
1 31364

Cardinality of country_code per patent

count
country_code
1 31364
Python
# OPENALEX — canonical, M2M, refs (updated columns)
# Locate the latest Silver outputs (Parquet or CSV fallback) for each table.
t_canon_path = find_file(openalex_dir, "topics_canonical")
kw_m2m_path  = find_file(openalex_dir, "topic_keywords_m2m")
sib_m2m_path = find_file(openalex_dir, "topic_siblings_m2m")
dom_ref_path = find_file(openalex_dir, "domains_ref")
fld_ref_path = find_file(openalex_dir, "fields_ref")
sub_ref_path = find_file(openalex_dir, "subfields_ref")

topics = None
if t_canon_path and t_canon_path.exists():
    display(Markdown(f"**topics_canonical:** `{t_canon_path}`"))
    topics = read_any(t_canon_path)
    # Updated pipeline uses: display_name, description, works_count, cited_by_count,
    # created_date, updated_date, lang_display_name, lang_description, domain_id/field_id/subfield_id (+ names)
    for c in ["works_count","cited_by_count"]:
        if c in topics.columns:
            topics[c] = pd.to_numeric(topics[c], errors="coerce")
    # Add on-the-fly length columns just for profiling (if you want, optional)
    if "display_name" in topics.columns:
        topics["display_name_len"] = topics["display_name"].astype(str).str.len()
    if "description" in topics.columns:
        topics["description_len"] = topics["description"].astype(str).str.len()

    topics = basic_profile(topics, "topics_canonical")
    for c in ["domain_name","field_name","subfield_name","lang_display_name","lang_description"]:
        if c in topics.columns:
            display(Markdown(f"Top values in `{c}`")); display(topics[c].value_counts().head(15).to_frame("count"))
    for c in ["description_len","display_name_len","works_count","cited_by_count"]:
        if c in topics.columns and pd.api.types.is_numeric_dtype(topics[c]):
            display(Markdown(f"**Describe numeric: `{c}`**")); display(topics[c].describe().to_frame(c))
    # Date coverage (new names)
    for c in ["created_date","updated_date"]:
        if c in topics.columns:
            display(Markdown(f"**Date coverage for `{c}`**")); display(date_range(topics, c))
else:
    display(Markdown("> ⚠️ `topics_canonical` not found."))

if kw_m2m_path and kw_m2m_path.exists():
    display(Markdown(f"**topic_keywords_m2m:** `{kw_m2m_path}`"))
    kw = read_any(kw_m2m_path); kw = basic_profile(kw, "topic_keywords_m2m")
    if topics is not None and "topic_id" in kw.columns and "topic_id" in topics.columns:
        display(Markdown("**FK check** `topic_keywords_m2m.topic_id → topics_canonical.topic_id`"))
        show_frac("unmatched_frac", (~kw["topic_id"].isin(topics["topic_id"])).mean())
else:
    display(Markdown("> ⚠️ `topic_keywords_m2m` not found."))

if sib_m2m_path and sib_m2m_path.exists():
    display(Markdown(f"**topic_siblings_m2m:** `{sib_m2m_path}`"))
    sib = read_any(sib_m2m_path); sib = basic_profile(sib, "topic_siblings_m2m")
    if topics is not None and "topic_id" in topics.columns:
        display(Markdown("**FK checks** (both ends should exist in topics_canonical)"))
        # An absent edge column counts as fully unmatched (1.0) rather than crashing.
        src_unmatched = (~sib["topic_id"].isin(topics["topic_id"])).mean() if "topic_id" in sib.columns else 1.0
        dst_unmatched = (~sib["sibling_topic_id"].isin(topics["topic_id"])).mean() if "sibling_topic_id" in sib.columns else 1.0
        show_frac("edge_src_unmatched_frac", src_unmatched)
        show_frac("edge_dst_unmatched_frac", dst_unmatched)
else:
    display(Markdown("> ⚠️ `topic_siblings_m2m` not found."))

# FIX: the original guards tested {"fk","id"}.issubset(topics.columns | ref.columns),
# which is also satisfied when both names live in the SAME frame and the other has
# neither — then the indexing below raises KeyError. Check each column against the
# frame it is actually read from.
if dom_ref_path and dom_ref_path.exists():
    display(Markdown(f"**domains_ref:** `{dom_ref_path}`"))
    dom = read_any(dom_ref_path); dom = basic_profile(dom, "domains_ref")
    if topics is not None and "domain_id" in topics.columns and "id" in dom.columns:
        display(Markdown("**Join coverage:** topics.domain_id → domains_ref.id"))
        show_frac("unmatched_frac", (~topics["domain_id"].isin(dom["id"])).mean())
if fld_ref_path and fld_ref_path.exists():
    display(Markdown(f"**fields_ref:** `{fld_ref_path}`"))
    fld = read_any(fld_ref_path); fld = basic_profile(fld, "fields_ref")
    if topics is not None and "field_id" in topics.columns and "id" in fld.columns:
        display(Markdown("**Join coverage:** topics.field_id → fields_ref.id"))
        show_frac("unmatched_frac", (~topics["field_id"].isin(fld["id"])).mean())
if sub_ref_path and sub_ref_path.exists():
    display(Markdown(f"**subfields_ref:** `{sub_ref_path}`"))
    sub = read_any(sub_ref_path); sub = basic_profile(sub, "subfields_ref")
    if topics is not None and "subfield_id" in topics.columns and "id" in sub.columns:
        display(Markdown("**Join coverage:** topics.subfield_id → subfields_ref.id"))
        show_frac("unmatched_frac", (~topics["subfield_id"].isin(sub["id"])).mean())

topics_canonical: C:\Users\gerbe\PycharmProjects\MT\cleantech_data\silver\openalex\2025-08-09\topics_canonical.parquet

topics_canonical

Text Only
1
shape: (4516, 17)

Dtypes

dtype
works_count int64
cited_by_count int64
display_name_len int64
description_len int64
topic_id object
display_name object
description object
created_date object
updated_date object
lang_display_name object
lang_description object
domain_id object
domain_name object
field_id object
field_name object
subfield_id object
subfield_name object

Top 20 Null Fractions

na_frac
topic_id 0.0
display_name 0.0
description 0.0
works_count 0.0
cited_by_count 0.0
created_date 0.0
updated_date 0.0
lang_display_name 0.0
lang_description 0.0
domain_id 0.0
domain_name 0.0
field_id 0.0
field_name 0.0
subfield_id 0.0
subfield_name 0.0
display_name_len 0.0
description_len 0.0

Head(5)

topic_id display_name description works_count cited_by_count created_date updated_date lang_display_name lang_description domain_id domain_name field_id field_name subfield_id subfield_name display_name_len description_len
0 T11881 Crystallization and Solubility Studies This cluster of papers focuses on the crystallization processes and control, including topics such as nucleation, solubility, polymorphi... 975416 792291 2024-01-23 2025-08-04T05:30:13.285909 en en 3 Physical Sciences 25 Materials Science 2505 Materials Chemistry 38 308
1 T11475 French Urban and Social Studies This cluster of papers explores the intersection of territorial governance, environmental participation, and sustainable development. It... 656763 703244 2024-01-23 2025-08-04T05:21:24.303694 en en 2 Social Sciences 33 Social Sciences 3312 Sociology and Political Science 31 412
2 T13445 American Constitutional Law and Politics This cluster of papers explores the development of American political thought and history, focusing on topics such as the American found... 467988 1823982 2024-01-23 2025-08-04T05:26:43.888907 en en 2 Social Sciences 33 Social Sciences 3320 Political Science and International Relations 40 327
3 T10165 Classical Antiquity Studies This cluster of papers explores various aspects of ancient Mediterranean civilizations, including the society, economy, literature, and ... 395325 1639281 2024-01-23 2025-08-04T05:20:01.858253 en en 2 Social Sciences 33 Social Sciences 3314 Anthropology 27 354
4 T10362 Biblical Studies and Interpretation This cluster of papers focuses on biblical studies, ancient Judaism, early Christianity, and Christian theology. It explores topics such... 343175 1059942 2024-01-23 2025-08-04T06:01:56.990280 en en 2 Social Sciences 12 Arts and Humanities 1212 Religious studies 35 320

Top values in domain_name

count
domain_name
Physical Sciences 1571
Social Sciences 1487
Health Sciences 844
Life Sciences 614

Top values in field_name

count
field_name
Social Sciences 764
Medicine 692
Engineering 560
Computer Science 302
Arts and Humanities 266
Biochemistry, Genetics and Molecular Biology 248
Agricultural and Biological Sciences 235
Environmental Science 202
Business, Management and Accounting 146
Psychology 144
Materials Science 123
Health Professions 107
Economics, Econometrics and Finance 107
Physics and Astronomy 104
Chemistry 101

Top values in subfield_name

count
subfield_name
Sociology and Political Science 224
Education 137
Molecular Biology 135
Political Science and International Relations 118
Electrical and Electronic Engineering 117
Surgery 96
Plant Science 90
Economics and Econometrics 79
Mechanical Engineering 79
Information Systems 78
Artificial Intelligence 77
Biomedical Engineering 72
Materials Chemistry 69
Organic Chemistry 61
Public Health, Environmental and Occupational Health 54

Top values in lang_display_name

count
lang_display_name
en 4516

Top values in lang_description

count
lang_description
en 4516

Describe numeric: description_len

description_len
count 4516.000000
mean 383.544066
std 72.346466
min 192.000000
25% 330.750000
50% 378.000000
75% 430.000000
max 828.000000

Describe numeric: display_name_len

display_name_len
count 4516.000000
mean 36.277015
std 6.990049
min 14.000000
25% 31.000000
50% 36.000000
75% 40.000000
max 83.000000

Describe numeric: works_count

works_count
count 4516.000000
mean 49773.929805
std 46035.488858
min 69.000000
25% 20376.750000
50% 38681.500000
75% 67517.250000
max 975416.000000

Describe numeric: cited_by_count

cited_by_count
count 4.516000e+03
mean 5.897473e+05
std 7.457060e+05
min 6.000000e+01
25% 9.836400e+04
50% 3.315000e+05
75% 7.917502e+05
max 8.149172e+06

Date coverage for created_date

non_null min max
created_date 4516 2024-01-23 2024-01-23

Date coverage for updated_date

non_null min max
updated_date 4516 2024-03-04 04:59:34.287941 2025-08-04 06:06:45.191053

topic_keywords_m2m: C:\Users\gerbe\PycharmProjects\MT\cleantech_data\silver\openalex\2025-08-09\topic_keywords_m2m.parquet

topic_keywords_m2m

Text Only
1
shape: (45154, 3)

Dtypes

dtype
topic_id object
keyword_norm object
keyword_orig object

Top 20 Null Fractions

na_frac
topic_id 0.0
keyword_norm 0.0
keyword_orig 0.0

Head(5)

topic_id keyword_norm keyword_orig
0 T11881 crystallization Crystallization
1 T11881 nucleation Nucleation
2 T11881 solubility Solubility
3 T11881 polymorphism Polymorphism
4 T11881 ultrasound_assisted_crystallization Ultrasound-Assisted Crystallization

FK check topic_keywords_m2m.topic_id → topics_canonical.topic_id

unmatched_frac: 0.0000

topic_siblings_m2m: C:\Users\gerbe\PycharmProjects\MT\cleantech_data\silver\openalex\2025-08-09\topic_siblings_m2m.parquet

topic_siblings_m2m

Text Only
1
shape: (243006, 2)

Dtypes

dtype
topic_id object
sibling_topic_id object

Top 20 Null Fractions

na_frac
topic_id 0.0
sibling_topic_id 0.0

Head(5)

topic_id sibling_topic_id
0 T11881 T10275
1 T11881 T13889
2 T11881 T12302
3 T11881 T10440
4 T11881 T10311

FK checks (both ends should exist in topics_canonical)

edge_src_unmatched_frac: 0.0000

edge_dst_unmatched_frac: 0.0000

domains_ref: C:\Users\gerbe\PycharmProjects\MT\cleantech_data\silver\openalex\2025-08-09\domains_ref.parquet

domains_ref

Text Only
1
shape: (4, 2)

Dtypes

dtype
id object
name object

Top 20 Null Fractions

na_frac
id 0.0
name 0.0

Head(5)

id name
0 1 Life Sciences
1 2 Social Sciences
2 3 Physical Sciences
3 4 Health Sciences

Join coverage: topics.domain_id → domains_ref.id

unmatched_frac: 0.0000

fields_ref: C:\Users\gerbe\PycharmProjects\MT\cleantech_data\silver\openalex\2025-08-09\fields_ref.parquet

fields_ref

Text Only
1
shape: (26, 2)

Dtypes

dtype
id object
name object

Top 20 Null Fractions

na_frac
id 0.0
name 0.0

Head(5)

id name
0 11 Agricultural and Biological Sciences
1 12 Arts and Humanities
2 13 Biochemistry, Genetics and Molecular Biology
3 14 Business, Management and Accounting
4 15 Chemical Engineering

Join coverage: topics.field_id → fields_ref.id

unmatched_frac: 0.0000

subfields_ref: C:\Users\gerbe\PycharmProjects\MT\cleantech_data\silver\openalex\2025-08-09\subfields_ref.parquet

subfields_ref

Text Only
1
shape: (252, 2)

Dtypes

dtype
id object
name object

Top 20 Null Fractions

na_frac
id 0.0
name 0.0

Head(5)

id name
0 1100 General Agricultural and Biological Sciences
1 1102 Agronomy and Crop Science
2 1103 Animal Science and Zoology
3 1104 Aquatic Science
4 1105 Ecology, Evolution, Behavior and Systematics

Join coverage: topics.subfield_id → subfields_ref.id

unmatched_frac: 0.0000

Unify → silver/unified/unified_docs.parquet

Rules (unchanged, adapted to updated column names where necessary):

  • Media (from media_canonical)

  • doc_id = "media:" + sha1(url_key) (falls back to url_clean, then url, if url_key is missing)
  • title = title_clean
  • text = content_clean
  • date = published_dt
  • lang = lang_content if present else lang_title
  • url = url_clean
  • source = "kaggle.media"

  • Patents (from patents_normalized)

  • doc_id = "patent:" + publication_number
  • title = title_clean
  • text = abstract_clean
  • date = publication_date_dt
  • lang = lang_abs if present else lang_title
  • country = country_code
  • cpc_codes = list[str]
  • source = "kaggle.patent"

  • Topics (from topics_canonical)

  • doc_id = "topic:" + topic_id
  • title = display_name
  • text = description
  • date = updated_date or created_date
  • lang = lang_description or lang_display_name
  • url = topic_url (if present; else NA)
  • source = "openalex.topics"
Python
# ------------------------------ unify helpers ------------------------------

def _sha1_hex(text: str) -> str:
    return hashlib.sha1((text or "").encode("utf-8", "ignore")).hexdigest()

def _to_datetime(s: pd.Series) -> pd.Series:
    return pd.to_datetime(s, errors="coerce")

def _prefer(*vals):
    for v in vals:
        if v is not None and pd.notna(v) and str(v).strip() != "":
            return v
    return None

def _to_list_str(val: Any) -> list[str]:
    if val is None:
        return []
    try:
        if pd.isna(val):
            return []
    except Exception:
        pass
    if isinstance(val, (list, tuple, set)):
        return [str(x) for x in val if str(x).strip()]
    if isinstance(val, str):
        s = val.strip()
        if len(s) >= 2 and s[0] == "[" and s[-1] == "]":
            try:
                parsed = ast.literal_eval(s)
                if isinstance(parsed, (list, tuple, set)):
                    return [str(x) for x in parsed if str(x).strip()]
            except Exception:
                return [s] if s else []
        return [s] if s else []
    return [str(val)] if str(val).strip() else []

def _finalize_unified(df: pd.DataFrame) -> pd.DataFrame:
    # enforce dtypes
    out = df.copy()
    out["doc_type"] = out["doc_type"].astype("category")
    out["date"] = pd.to_datetime(out["date"], errors="coerce")
    # Normalize missing URL/country
    if "url" in out.columns:
        out["url"] = out["url"].astype("string")
    if "country" in out.columns:
        out["country"] = out["country"].astype("string")
    # cpc_codes must be a list[str]
    if "cpc_codes" in out.columns:
        out["cpc_codes"] = out["cpc_codes"].map(_to_list_str)
    # essential fields
    essential = out["doc_id"].notna() & out["title"].notna() & out["text"].notna() & out["date"].notna()
    dropped = len(out) - int(essential.sum())
    if dropped:
        print(f"[unify] dropping {dropped} rows missing essential fields (doc_id/title/text/date)")
    out = out[essential].copy()
    out = out.drop_duplicates(subset=["doc_id"], keep="first")
    out = out.sort_values(["doc_type","doc_id"]).reset_index(drop=True)
    return out

# ------------------------------ unify build ------------------------------

unified_rows: list[dict] = []

# Media view
if 'media' in globals() and isinstance(media, pd.DataFrame) and not media.empty:
    m = media.copy()

    def _m_doc_id(row):
        # prefer url_key; fall back to url_clean, then url
        key = (row.get("url_key") or row.get("url_clean") or row.get("url") or "").strip()
        return "media:" + _sha1_hex(key) if key else None

    # FIX: the original `m.get("lang_content") if "lang_content" in m.columns
    # else m.get("lang_title")` returned None (-> AttributeError on .fillna)
    # when NEITHER column exists. Resolve the column first, with a typed default.
    if "lang_content" in m.columns:
        lang_col = m["lang_content"]
    else:
        lang_col = m.get("lang_title", pd.Series(pd.NA, index=m.index, dtype="object"))

    mv = pd.DataFrame({
        "doc_id": m.apply(_m_doc_id, axis=1),
        "doc_type": "media",
        "title": m.get("title_clean", m.get("title", pd.Series(index=m.index))).astype(str),
        "text":  m.get("content_clean", m.get("content", pd.Series(index=m.index))).astype(str),
        "date":  _to_datetime(m.get("published_dt", pd.Series(pd.NaT, index=m.index))),
        "lang":  lang_col.fillna("unknown").astype(str),
        "source": "kaggle.media",
        "url":   m.get("url_clean", m.get("url", pd.Series(index=m.index))).astype(str),
        # FIX: explicit "string" dtype so the all-NA column does not hit the
        # deprecated pd.concat dtype-inference path (FutureWarning).
        "country": pd.Series(pd.NA, index=m.index, dtype="string"),
        # FIX: [[]]*len(m) aliased ONE shared list across every row; build a
        # fresh empty list per row instead.
        "cpc_codes": pd.Series([[] for _ in range(len(m))], index=m.index, dtype="object"),
    })
    unified_rows.append(mv)

# Patents view (from normalized)
if 'pn' in globals() and isinstance(pn, pd.DataFrame) and not pn.empty:
    p = pn.copy()
    pv = pd.DataFrame({
        "doc_id": "patent:" + p["publication_number"].astype(str),
        "doc_type": "patent",
        "title": p.get("title_clean", pd.Series(index=p.index)).astype(str),
        "text":  p.get("abstract_clean", pd.Series(index=p.index)).astype(str),
        "date":  _to_datetime(p.get("publication_date_dt", pd.Series(pd.NaT, index=p.index))),
        "lang":  p.get("lang_abs", p.get("lang_title", pd.Series(index=p.index))).fillna("unknown").astype(str),
        "source": "kaggle.patent",
        # FIX: explicit "string" dtype (patents carry no URL) so the all-NA
        # column does not trigger the deprecated pd.concat dtype-inference
        # FutureWarning.
        "url":   pd.Series(pd.NA, index=p.index, dtype="string"),
        "country": p.get("country_code", pd.Series(index=p.index)).astype("string"),
        # FIX: the fallback [[]]*len(p) aliased one shared list across rows;
        # build a fresh list per row.
        "cpc_codes": p.get("cpc_codes", pd.Series([[] for _ in range(len(p))], index=p.index, dtype="object")).map(_to_list_str),
    })
    unified_rows.append(pv)

# Topics view (updated column names)
if 'topics' in globals() and isinstance(topics, pd.DataFrame) and not topics.empty:
    t = topics.copy()
    # FIX: per-row fallback "updated_date or created_date" (as the rules state).
    # The original only fell back when EVERY updated_date was missing, leaving
    # rows with a NaT updated_date without a date even when created_date exists.
    date_series = _to_datetime(t.get("updated_date", pd.Series(pd.NaT, index=t.index)))
    created_series = _to_datetime(t.get("created_date", pd.Series(pd.NaT, index=t.index)))
    date_series = date_series.fillna(created_series)
    lang_series = t.get("lang_description", t.get("lang_display_name", pd.Series(index=t.index))).fillna("unknown").astype(str)
    tv = pd.DataFrame({
        "doc_id": "topic:" + t["topic_id"].astype(str),
        "doc_type": "topic",
        "title": t.get("display_name", pd.Series(index=t.index)).astype(str),
        "text":  t.get("description", pd.Series(index=t.index)).astype(str),
        "date":  date_series,
        "lang":  lang_series,
        "source": "openalex.topics",
        "url":   t.get("topic_url", pd.Series(pd.NA, index=t.index, dtype="string")).astype("string"),
        # FIX: typed all-NA column avoids the deprecated pd.concat dtype path.
        "country": pd.Series(pd.NA, index=t.index, dtype="string"),
        # FIX: fresh empty list per row (no shared-object aliasing).
        "cpc_codes": pd.Series([[] for _ in range(len(t))], index=t.index, dtype="object"),
    })
    unified_rows.append(tv)

if not unified_rows:
    raise RuntimeError("No input frames were loaded; cannot build unified docs. Check earlier validation steps.")

# Stack all source views, then enforce the unified schema (dtypes, essential
# fields, dedupe by doc_id, sort).
unified_df = pd.concat(unified_rows, ignore_index=True)
unified_df = _finalize_unified(unified_df)

display(Markdown("### Unified sample"))
display(unified_df.head(8))
display(Markdown("**Counts by doc_type**")); display(unified_df["doc_type"].value_counts().to_frame("count"))
display(Markdown("**Top languages**")); display(unified_df["lang"].value_counts().head(12).to_frame("count"))
# FIX: doc_type is categorical; observed=True silences the pandas FutureWarning
# about the changing groupby default and — since all three categories are
# present — leaves the output table unchanged.
display(Markdown("**Date coverage**")); display(unified_df.groupby("doc_type", observed=True)["date"].agg(["min","max","count"]))

# Write with explicit Arrow schema (cpc_codes as list<string>), fallback to pandas parquet
# The explicit schema guarantees a stable on-disk layout for downstream readers
# even when a column is all-NA in this run (e.g. country for a media-only build).
unified_dir = root / "silver" / "unified"
unified_dir.mkdir(parents=True, exist_ok=True)
pq_path = unified_dir / "unified_docs.parquet"

try:
    import pyarrow as pa, pyarrow.parquet as pq
    # Build schema; allow None for url/country
    # (pa.string() fields are nullable by default; date is stored as ns timestamps
    # to match the frame's datetime64[ns] column.)
    schema = pa.schema([
        pa.field("doc_id", pa.string()),
        pa.field("doc_type", pa.string()),
        pa.field("title", pa.string()),
        pa.field("text", pa.string()),
        pa.field("date", pa.timestamp("ns")),
        pa.field("lang", pa.string()),
        pa.field("source", pa.string()),
        pa.field("url", pa.string()),
        pa.field("country", pa.string()),
        pa.field("cpc_codes", pa.list_(pa.string())),
    ])
    # Ensure list[str]
    # (re-normalized on a copy so the in-memory unified_df is left untouched)
    udf = unified_df.copy()
    udf["cpc_codes"] = udf["cpc_codes"].map(_to_list_str)
    # Column order is pinned to schema.names so from_pandas aligns frame
    # columns with schema fields positionally.
    table = pa.Table.from_pandas(udf[list(schema.names)], schema=schema, preserve_index=False)
    pq.write_table(table, pq_path)
    print(f"[unify] Wrote Arrow-parquet with explicit schema → {pq_path}")
except Exception as exc:
    # Broad catch is deliberate: covers both a missing pyarrow install and any
    # schema-cast failure; the pandas writer is the best-effort fallback.
    print(f"[unify] Arrow write failed ({type(exc).__name__}: {exc}); falling back to pandas.to_parquet")
    unified_df.to_parquet(pq_path, index=False)
    print(f"[unify] Wrote pandas parquet → {pq_path}")

# quick sanity echo
print("Rows:", len(unified_df), "| Path:", pq_path)
Text Only
1
2
C:\Users\gerbe\AppData\Local\Temp\ipykernel_66220\2918688680.py:127: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.
  unified_df = pd.concat(unified_rows, ignore_index=True)

Unified sample

doc_id doc_type title text date lang source url country cpc_codes
0 media:000999be09894ed5b098ed528fe55b30e3e26054 media JinkoSolar and Marubeni Deliver 6MWh of SunTera ESS to Kitakyushu, Japan, Enhancing Efficient and Safe Energy Management – pv magazine International Recently, JinkoSolar has successfully signed a supply agreement with Marubeni Corporation for two 3MWh SunTera energy storage systems, providing a total of ... 2024-10-23 en kaggle.media https://www.pv-magazine.com/press-releases/jinkosolar-and-marubeni-deliver-6mwh-of-suntera-ess-to-kitakyushu-japan-enhancing-efficient-and-safe-energy-manag... <NA> []
1 media:000d800574212a3f7263a0aa491e38d0fbad660f media Combustion Equipment ( Energy Management) Videos The GCU evo ( Gas Combustion Unit) developed by SAACKE makes the safe transport of liquefied natural gas ( LNG) at sea possible. It utilises surplus boil-of... 2023-07-26 en kaggle.media https://www.energy-xprt.com/energy-management/combustion-equipment/videos <NA> []
2 media:0016fb2f14828861db0b2f07275e119dad2d44c3 media Wood appointed owner's engineer for UK’ s first commercial lithium refinery Mineral processing company Green Lithium has appointed Wood as its owner’ s engineer, as it progresses plans to build and operate the UK’ s first large-scal... 2022-01-24 en kaggle.media https://www.energyvoice.com/renewables-energy-transition/381915/wood-appointed-owners-engineer-for-uks-first-commercial-lithium-refinery/ <NA> []
3 media:0016fc189cc7a6b4b1b6992dd511ba6ef7710564 media Energy Technology Perspectives 2020 – Analysis or subcribe to a paid service.\nTo avoid the worst consequences of climate change, the global energy system must rapidly reduce its emissions. Calls to redu... 2024-02-03 en kaggle.media https://www.iea.org/reports/energy-technology-perspectives-2020 <NA> []
4 media:0017c5e566b4104ef39b4fe93b8068b870644e2a media 'Wartime footing ' upgrade needed as grid stalls offshore wind plans, warns UK champion The UK was warned that availability of grid connections is emerging as the number-one obstacle to its offshore wind ambitions, as the government’ s own ‘ ch... 2023-04-05 en kaggle.media https://www.rechargenews.com/news/2-1-1431535 <NA> []
5 media:0018edf9110251ce52dd8bfd30f7b6dc011e9f62 media Hailo Wind Systems Gmbh & Co. Kg Profile The wind industry is characterised by its great innovative force, requiring a high degree of reliability, flexibility, and perseverance. Only those companie... 2023-06-14 en kaggle.media https://www.energy-xprt.com/companies/hailo-wind-systems-gmbh-co-kg-125537 <NA> []
6 media:001c7395ac57425f1bb0440cc36d2e6bccdd4858 media New method to avoid overvoltage in vertical PV linked to low-voltage grids – pv magazine International A research group in Finland has created a new workflow to prevent voltage rises in bifacial vertical PV systems located at high latitudes. According to thei... 2023-07-03 en kaggle.media https://www.pv-magazine.com/2023/07/03/new-method-to-avoid-overvoltage-in-vertical-pv-linked-to-low-voltage-grids/ <NA> []
7 media:00241ce5830570c00ee65588e48c5430bdbbb492 media SolarEV City Concept Could Be Viable in High-Latitude Cities Like Paris By continuing to browse this site you agree to our use of cookies. More info.\nThe iconic city of Paris is synonymous with climate change, thanks in part to... 2023-09-19 en kaggle.media https://www.azocleantech.com/news.aspx?newsID=34010 <NA> []

Counts by doc_type

count
doc_type
patent 31364
media 19341
topic 4516

Top languages

count
lang
en 52729
zh 2210
ko 133
fr 91
de 27
ja 7
pl 6
ar 3
ru 3
pt 2
cs 2
la 2

Date coverage

Text Only
1
2
C:\Users\gerbe\AppData\Local\Temp\ipykernel_66220\2918688680.py:134: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  display(Markdown("**Date coverage**")); display(unified_df.groupby("doc_type")["date"].agg(["min","max","count"]))
min max count
doc_type
media 2022-01-02 00:00:00.000000 2024-10-24 00:00:00.000000 19341
patent 2022-01-01 00:00:00.000000 2024-09-05 00:00:00.000000 31364
topic 2024-03-04 04:59:34.287941 2025-08-04 06:06:45.191053 4516
Text Only
1
2
[unify] Wrote Arrow-parquet with explicit schema → C:\Users\gerbe\PycharmProjects\MT\cleantech_data\silver\unified\unified_docs.parquet
Rows: 55221 | Path: C:\Users\gerbe\PycharmProjects\MT\cleantech_data\silver\unified\unified_docs.parquet