Skip to content

Cleantech Silver Validation + Unification (Latest Buckets Version 2025-08-09)

This notebook reads the latest Silver buckets for: media — media_canonical, media_dupe_links; patents — patent_canonical, patent_dupe_links, patents_normalized; openalex — topics_canonical, topic_keywords_m2m, topic_siblings_m2m, domains_ref, fields_ref, subfields_ref.

Then it unifies the three sources into a single Parquet file at:

Text Only
cleantech_data/silver/unified/unified_docs.parquet

Minimal unified schema

Text Only
doc_id, doc_type, title, text, date, lang, source, url, country, cpc_codes
- doc_type ∈ {"media","patent","topic"}
- date is UTC-naive datetime64[ns]
- lang is ISO‑639‑1 where available, else "unknown"
- country only for patents (2-letter code), else NA
- cpc_codes is a list (empty list for non‑patents)

Notes:

- The notebook tolerates either Parquet or CSV.GZ Silver outputs (because your safe_write may fall back to CSV).
- It adapts to the updated OpenAlex topic structure (created_date / updated_date, lang_description, lang_display_name).

Python
from __future__ import annotations

import os, re, ast, json, hashlib
from pathlib import Path
from datetime import datetime
from typing import Optional, Iterable, Any, List, Dict

import pandas as pd

# Rich display when running under IPython/Jupyter; otherwise degrade to
# plain-text stand-ins so the module still runs as a script.
try:
    from IPython.display import display, Markdown
    _HAS_RICH = True
except Exception:
    _HAS_RICH = False
    def display(x): print(x)
    def Markdown(x): return x

# Wide pandas display settings for notebook-style profiling output.
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 160)
pd.set_option("display.max_colwidth", 160)

# ---------------------------- small IO helpers ----------------------------

def find_file(base_dir: str | Path, stem: str) -> Optional[Path]:
    """Locate the Silver output file for *stem* under *base_dir*.

    An exact ``<stem><ext>`` match wins outright; otherwise prefix matches
    (``<stem>*<ext>``) are ranked Parquet first, then ``.csv.gz``, then plain
    ``.csv``, with ties broken by file name. Returns None when nothing matches
    or the directory does not exist.
    """
    base = Path(base_dir)
    if not base.exists():
        return None
    extensions = (".parquet", ".pq", ".csv.gz", ".csv")

    # Exact-name hit: take it immediately.
    for ext in extensions:
        exact = base / f"{stem}{ext}"
        if exact.exists():
            return exact

    # Fall back to prefix globs across all known extensions.
    matches: list[Path] = []
    for ext in extensions:
        matches.extend(base.glob(f"{stem}*{ext}"))
    if not matches:
        return None

    def _rank(candidate: Path) -> tuple[int, str]:
        suffix = candidate.suffix.lower()
        if suffix in (".parquet", ".pq"):
            return (0, candidate.name)
        if suffix == ".gz":  # ".csv.gz" files carry the ".gz" suffix
            return (1, candidate.name)
        return (2, candidate.name)

    return min(matches, key=_rank)

def _read_parquet(path: Path, n_rows: Optional[int]) -> pd.DataFrame:
    """Read a Parquet file, optionally truncating to the first *n_rows* rows.

    Uses ``is not None`` (not truthiness) so ``n_rows=0`` yields an empty
    frame, matching ``pd.read_csv(nrows=0)`` behavior in ``_read_csv_like``.
    """
    df = pd.read_parquet(path)
    return df.head(n_rows) if n_rows is not None else df

def _read_csv_like(path: Path, n_rows: Optional[int]) -> pd.DataFrame:
    """Read a CSV file (optionally gzip-compressed), limited to *n_rows* rows when given."""
    gzipped = path.suffix.lower() == ".gz" or path.name.endswith(".csv.gz")
    if gzipped:
        return pd.read_csv(path, compression="infer", nrows=n_rows, low_memory=False)
    return pd.read_csv(path, nrows=n_rows, low_memory=False)

def read_any(path: str | Path, n_rows: Optional[int] = None) -> pd.DataFrame:
    """Read Parquet or CSV(.gz) based on extension, trying the other format on failure.

    The extension picks the primary reader; any exception falls through to the
    alternate reader, tolerating mislabeled Silver outputs.
    """
    p = Path(path)
    suffix = p.suffix.lower()
    if suffix in (".parquet", ".pq"):
        primary, fallback = _read_parquet, _read_csv_like
    elif suffix in (".csv", ".gz"):
        primary, fallback = _read_csv_like, _read_parquet
    else:
        # Unknown extension: assume Parquet (the pipeline default) first.
        primary, fallback = _read_parquet, _read_csv_like
    try:
        return primary(p, n_rows)
    except Exception:
        return fallback(p, n_rows)

# ------------------------- profiling display utils ------------------------

def _maybe_drop_header_row(df: pd.DataFrame):
    """Drop the first row if it duplicates the column names (CSV round-trip artifact).

    Returns ``(frame, dropped_flag)``; the frame is re-indexed when a row was dropped.
    """
    if df.empty:
        return df, False
    try:
        header_names = [str(c) for c in df.columns.tolist()]
        row_values = df.iloc[0].astype(str).tolist()
        if row_values == header_names:
            return df.iloc[1:].reset_index(drop=True), True
    except Exception:
        # Comparison can fail on exotic dtypes; keep the frame untouched.
        pass
    return df, False

def _dtypes_table(df: pd.DataFrame) -> pd.DataFrame:
    """One-column dtype table ordered bool < int < float < string < object < datetime < category."""
    precedence = {
        "bool": 0, "boolean": 0,
        "Int64": 1, "int64": 1,
        "Float64": 2, "float64": 2,
        "string": 3,
        "object": 4,
        "datetime64[ns]": 5,
        "category": 6,
    }
    table = df.dtypes.astype(str).to_frame(name="dtype")
    ranks = table["dtype"].map(lambda t: precedence.get(t, 99))
    return table.assign(_rank=ranks).sort_values(["_rank", "dtype"]).drop(columns="_rank")

def _show_df(df: pd.DataFrame, n: int = 5):
    """Show the first *n* rows: rich display when available, plain print otherwise."""
    preview = df.head(n)
    if _HAS_RICH:
        display(preview)
    else:
        print(preview.to_string())

def basic_profile(df: pd.DataFrame, name: str = "", head_n: int = 5) -> pd.DataFrame:
    """Profile a frame: shape, dtypes, top-20 null fractions and a head preview.

    Strips a duplicated header row if present and returns the (possibly
    cleaned) frame for further use.
    """
    if name:
        display(Markdown(f"### {name}"))
    print("shape:", df.shape)
    cleaned, header_dropped = _maybe_drop_header_row(df)
    if header_dropped:
        print("[fix] Dropped first row that duplicated the header names.")
    display(Markdown("**Dtypes**"))
    display(_dtypes_table(cleaned))
    display(Markdown("**Top 20 Null Fractions**"))
    null_fracs = cleaned.isna().mean().sort_values(ascending=False).to_frame("na_frac")
    _show_df(null_fracs, n=min(20, len(null_fracs)))
    display(Markdown(f"**Head({head_n})**"))
    with pd.option_context("display.max_colwidth", 140):
        _show_df(cleaned, n=head_n)
    return cleaned

def bar_top(df: pd.DataFrame, col: str, n: int = 20, title: Optional[str] = None):
    """Bar-chart the top *n* value counts of *col*; fall back to a count table
    when matplotlib is unavailable or plotting fails."""
    try:
        import matplotlib.pyplot as plt
        counts = df[col].value_counts(dropna=False).head(n)
        plt.figure(figsize=(10, 4))
        counts.plot(kind="bar")
        plt.xticks(rotation=45, ha="right")
        if title:
            plt.title(title)
        plt.tight_layout()
        plt.show()
    except Exception:
        display(df[col].value_counts(dropna=False).head(n).to_frame("count"))

def date_range(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Single-row summary (non_null count, min, max) of a datetime-like column.

    Missing columns yield a zero/NaT row so callers never have to branch.
    """
    if col not in df.columns:
        return pd.DataFrame({"non_null": [0], "min": [pd.NaT], "max": [pd.NaT]}, index=[col])
    parsed = pd.to_datetime(df[col], errors="coerce")
    summary = {
        "non_null": [int(parsed.notna().sum())],
        "min": [parsed.min()],
        "max": [parsed.max()],
    }
    return pd.DataFrame(summary, index=[col])

# Pretty coverage helper
def show_frac(label: str, frac) -> None:
    """Render a labelled fraction to four decimal places (NaN when not numeric)."""
    try:
        value = float(frac)
    except Exception:
        value = float("nan")
    display(Markdown(f"**{label}:** `{value:.4f}`"))

# -------------------------- environment discovery -------------------------

# Matches YYYY-MM-DD bucket directory names.
# NOTE(review): not referenced in this part of the file (latest_bucket parses
# via strptime instead) — confirm it is used elsewhere before removing.
_DATE_RE = re.compile(r"^\d{4}-\d{2}-\d{2}$")

def _repo_root_from_cwd() -> Path:
    """Walk upward from CWD to the first directory holding .git or pyproject.toml.

    Falls back to the CWD itself when no marker is found.
    """
    here = Path.cwd()
    for candidate in (here, *here.parents):
        markers = (candidate / ".git", candidate / "pyproject.toml")
        if any(marker.exists() for marker in markers):
            return candidate
    return here

def discover_data_root() -> Path:
    """Resolve the cleantech_data directory.

    Precedence: CLEANTECH_DATA_DIR env var (if it exists on disk), then
    <repo>/cleantech_data, then <repo>/notebooks/cleantech_data, then a
    cleantech_data path under the CWD as a last resort.
    """
    env = os.getenv("CLEANTECH_DATA_DIR")
    if env and Path(env).exists():
        return Path(env).resolve()
    repo = _repo_root_from_cwd()
    for candidate in (repo / "cleantech_data", repo / "notebooks" / "cleantech_data"):
        resolved = candidate.resolve()
        if resolved.exists():
            return resolved
    return (Path.cwd() / "cleantech_data").resolve()

def latest_bucket(base: Path) -> Path:
    """Return the newest sub-directory of *base*.

    Date-named (YYYY-MM-DD) directories win, latest date first; otherwise the
    most recently modified directory; otherwise *base* itself (also returned
    when *base* does not exist).
    """
    if not base.exists():
        return base
    dated, undated = [], []
    for child in base.iterdir():
        if not child.is_dir():
            continue
        try:
            bucket_date = datetime.strptime(child.name, "%Y-%m-%d").date()
        except Exception:
            undated.append(child)
        else:
            dated.append((bucket_date, child))
    if dated:
        return max(dated, key=lambda pair: pair[0])[1]
    if undated:
        return max(undated, key=lambda d: d.stat().st_mtime)
    return base

def silver_bucket(dataset: str, root: Path) -> Path:
    """Resolve the newest dated bucket under ``<root>/silver/<dataset>``."""
    dataset_base = root / "silver" / dataset
    return latest_bucket(dataset_base)

def list_dir_files(p: Path) -> pd.DataFrame:
    """Table of files directly under *p*: name, byte size, mtime, human size.

    Rows are sorted newest-first; an empty frame is returned for a missing or
    empty directory.
    """
    records = []
    if p.exists():
        for entry in p.iterdir():
            if not entry.is_file():
                continue
            try:
                st = entry.stat()
                records.append({
                    "file": entry.name,
                    "bytes": st.st_size,
                    "modified": datetime.fromtimestamp(st.st_mtime),
                })
            except Exception:
                # stat() can fail on transient/locked files; keep the name.
                records.append({"file": entry.name, "bytes": None, "modified": None})
    out = pd.DataFrame(records)
    if out.empty:
        return out
    if "modified" in out.columns:
        out = out.sort_values("modified", ascending=False)
    if "bytes" in out.columns:
        def _human(size):
            """Format a byte count as e.g. '1.5 MB'."""
            if size is None:
                return None
            value = float(size)
            for unit in ("B", "KB", "MB", "GB", "TB"):
                if value < 1024:
                    return f"{value:.1f} {unit}"
                value /= 1024
            return f"{value:.1f} PB"
        out["size"] = out["bytes"].map(_human)
    return out

# Media dtype coercion (bool/Int typed cleanly)
def coerce_media_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy with media columns coerced to clean nullable dtypes.

    is_listing -> pandas "boolean" (textual true/false spellings accepted),
    content_*_count -> Int64, published_dt -> datetime (invalid -> NaT).
    """
    out = df.copy()

    if "is_listing" in out.columns:
        def _parse_bool(value):
            # Accept common truthy/falsy spellings; anything else -> NA.
            if pd.isna(value):
                return pd.NA
            token = str(value).strip().lower()
            if token in {"true", "1", "yes"}:
                return True
            if token in {"false", "0", "no"}:
                return False
            return pd.NA
        out["is_listing"] = out["is_listing"].map(_parse_bool).astype("boolean")

    for count_col in ("content_char_count", "content_word_count"):
        if count_col in out.columns:
            out[count_col] = pd.to_numeric(out[count_col], errors="coerce").astype("Int64")

    if "published_dt" in out.columns:
        out["published_dt"] = pd.to_datetime(out["published_dt"], errors="coerce")
    return out

# Robust list cardinality (patents etc.)
def cell_cardinality(x) -> int:
    """Return the number of elements represented by a cell value.

    Handles real sequences, numpy/extension arrays, stringified lists like
    "[a, b]" (CSV round-trip artifact), scalars (count 1), and missing values
    (None/NaN/pd.NA -> 0).

    Fix: the scalar missing-value check previously matched only ``bool`` and
    ``type(pd.NA)``, so a ``numpy.bool_`` result from ``pd.isna`` slipped
    through and a NaN cell could be counted as 1. We now accept any 0-dim
    scalar truth value (covers both ``bool`` and ``numpy.bool_``) while still
    skipping array-shaped results for list-like inputs.
    """
    if x is None:
        return 0
    try:
        is_na = pd.isna(x)
        # Only treat scalar (0-dim) NA results as "missing"; array results
        # mean x is list-like and is handled below.
        if not isinstance(is_na, (list, tuple)) and getattr(is_na, "ndim", 0) == 0:
            if bool(is_na):
                return 0
    except (TypeError, ValueError):
        pass
    if isinstance(x, (list, tuple, set)):
        return len(x)
    try:
        import numpy as np
        if isinstance(x, (np.ndarray, pd.api.extensions.ExtensionArray)):
            return int(getattr(x, "size", len(x)))
    except Exception:
        pass
    if isinstance(x, str):
        s = x.strip()
        # Stringified list from a CSV round-trip: parse and count.
        if len(s) >= 2 and s[0] == "[" and s[-1] == "]":
            try:
                parsed = ast.literal_eval(s)
                if isinstance(parsed, (list, tuple, set)):
                    return len(parsed)
            except Exception:
                pass
        return 0 if s == "" else 1
    return 1
Python
root = discover_data_root()
display(Markdown(f"**Data root (chosen):** `{root}`"))

# Resolve the newest dated Silver bucket for each of the three sources.
media_dir    = silver_bucket("media", root)
patents_dir  = silver_bucket("patents", root)
openalex_dir = silver_bucket("openalex", root)

display(Markdown("**Latest Silver buckets**"))
display({"media": str(media_dir), "patents": str(patents_dir), "openalex": str(openalex_dir)})

# List the contents of each bucket so stale/missing files are obvious.
for bucket_label, bucket_dir in (("Media", media_dir), ("Patents", patents_dir), ("OpenAlex", openalex_dir)):
    display(Markdown(f"#### Files in {bucket_label} bucket"))
    display(list_dir_files(bucket_dir))

Data root (chosen): C:\Users\gerbe\PycharmProjects\MT\cleantech_data

Latest Silver buckets

Text Only
1
2
3
{'media': 'C:\\Users\\gerbe\\PycharmProjects\\MT\\cleantech_data\\silver\\media\\2025-08-09',
 'patents': 'C:\\Users\\gerbe\\PycharmProjects\\MT\\cleantech_data\\silver\\patents\\2025-08-09',
 'openalex': 'C:\\Users\\gerbe\\PycharmProjects\\MT\\cleantech_data\\silver\\openalex\\2025-08-09'}

Files in Media bucket

file bytes modified size
1 media_dupe_links.parquet 13711 2025-08-24 17:35:16.331749 13.4 KB
0 media_canonical.parquet 147587328 2025-08-24 17:35:16.295887 140.8 MB
2 media_excluded_non_articles.parquet 2690049 2025-08-24 17:11:55.436329 2.6 MB

Files in Patents bucket

file bytes modified size
0 patents_normalized.parquet 17524884 2025-08-24 12:22:59.683251 16.7 MB
2 patent_dupe_links.parquet 745944 2025-08-24 12:22:59.338741 728.5 KB
1 patent_canonical.parquet 33063620 2025-08-24 12:22:59.126081 31.5 MB

Files in OpenAlex bucket

file bytes modified size
2 subfields_ref.parquet 6953 2025-08-24 12:30:08.482854 6.8 KB
1 fields_ref.parquet 2249 2025-08-24 12:30:08.478860 2.2 KB
0 domains_ref.parquet 1649 2025-08-24 12:30:08.475859 1.6 KB
5 topic_siblings_m2m.parquet 439273 2025-08-24 12:30:08.469853 429.0 KB
4 topic_keywords_m2m.parquet 943805 2025-08-24 12:30:08.417989 921.7 KB
3 topics_canonical.parquet 1109773 2025-08-24 12:30:08.388933 1.1 MB
Python
# MEDIA — canonical & duplicate links
media_canon_path = find_file(media_dir, "media_canonical")
media_links_path = find_file(media_dir, "media_dupe_links")

if media_canon_path and media_canon_path.exists():
    display(Markdown(f"**media_canonical:** `{media_canon_path}`"))
    # Coerce typed columns first so the profile shows clean dtypes.
    media = coerce_media_dtypes(read_any(media_canon_path))
    media = basic_profile(media, "media_canonical")
    # Numeric summaries for the content-size columns.
    for numeric_col in ["content_char_count", "content_word_count"]:
        if numeric_col in media.columns and pd.api.types.is_numeric_dtype(media[numeric_col]):
            display(Markdown(f"**Describe numeric: `{numeric_col}`**"))
            display(media[numeric_col].describe().to_frame(numeric_col))
    # Frequency tables for the categorical columns of interest.
    for cat_col in ["domain", "lang_title", "lang_content"]:
        if cat_col in media.columns:
            display(Markdown(f"Top values in `{cat_col}`"))
            display(media[cat_col].value_counts().head(15).to_frame("count"))
    if "published_dt" in media.columns:
        display(Markdown("**Date coverage for `published_dt`**"))
        display(date_range(media, "published_dt"))
else:
    display(Markdown("> ⚠️ `media_canonical` not found."))

if media_links_path and media_links_path.exists():
    display(Markdown(f"**media_dupe_links:** `{media_links_path}`"))
    mlinks = basic_profile(read_any(media_links_path), "media_dupe_links")
    if "reason" in mlinks.columns:
        display(Markdown("**Duplicate reasons (media_dupe_links)**"))
        display(mlinks["reason"].value_counts().to_frame("count"))
else:
    display(Markdown("> ⚠️ `media_dupe_links` not found."))

media_canonical: C:\Users\gerbe\PycharmProjects\MT\cleantech_data\silver\media\2025-08-09\media_canonical.parquet

media_canonical

Text Only
1
shape: (19341, 24)

Dtypes

dtype
is_article_like bool
is_listing boolean
content_char_count Int64
content_word_count Int64
author float64
lang_title_score float64
lang_content_score float64
title object
date object
content object
domain object
url object
title_clean object
content_rawtext object
content_clean object
lang_title object
lang_content object
content_sha1 object
title_fp object
url_key object
non_article_reason object
_root object
url_clean object
published_dt datetime64[ns]

Top 20 Null Fractions

na_frac
author 1.0
title 0.0
date 0.0
content 0.0
domain 0.0
url 0.0
is_listing 0.0
title_clean 0.0
content_rawtext 0.0
content_clean 0.0
lang_title 0.0
lang_content 0.0
lang_title_score 0.0
lang_content_score 0.0
content_char_count 0.0
content_word_count 0.0
content_sha1 0.0
title_fp 0.0
published_dt 0.0
url_key 0.0

Head(5)

title date author content domain url is_listing title_clean content_rawtext content_clean lang_title lang_content lang_title_score lang_content_score content_char_count content_word_count content_sha1 title_fp published_dt url_key non_article_reason is_article_like _root url_clean
0 Zambia set for 60 MW/20 MWh of solar, storage – pv magazine International 2024-05-06 NaN ['Turkish developer YEO and Zambian sustainable energy company are constructing a 60 MW solar plant with a 20 MWh battery energy storage... pv-magazine.com http://pv-magazine.com/2024/05/06/zambia-set-for-60-mw-20-mwh-of-solar-storage/ False Zambia set for 60 MW/20 MWh of solar, storage – pv magazine International ['Turkish developer YEO and Zambian sustainable energy company are constructing a 60 MW solar plant with a 20 MWh battery energy storage... Turkish developer YEO and Zambian sustainable energy company are constructing a 60 MW solar plant with a 20 MWh battery energy storage s... en en 0.999994 1.0 3602 589 5eca9aa2862632d015697f5fa0a67536e4490d93 20|60|international|magazine|mw|mwh|pv|set|solar|storage|zambia 2024-05-06 pv-magazine.com/2024/05/06/zambia-set-for-60-mw-20-mwh-of-solar-storage True http://pv-magazine.com/2024/05/06/zambia-set-for-60-mw-20-mwh-of-solar-storage/ http://pv-magazine.com/2024/05/06/zambia-set-for-60-mw-20-mwh-of-solar-storage/
1 All-Energy and Dcarbonise Conference Programme 2024-05-16 NaN ['The renewables and low carbon energy community relishes attending All-Energy and Dcarbonise. In 2023, over 9,706 participants attended... www.all-energy.co.uk http://www.all-energy.co.uk/cfs False All-Energy and Dcarbonise Conference Programme ['The renewables and low carbon energy community relishes attending All-Energy and Dcarbonise. In 2023, over 9,706 participants attended... The renewables and low carbon energy community relishes attending All-Energy and Dcarbonise. In 2023, over 9,706 participants attended t... en en 1.000000 1.0 2664 418 fc0b77c650cd9da58af79f07eb2f7b6a26540674 all|conference|dcarbonise|energy|programme 2024-05-16 all-energy.co.uk/cfs True http://www.all-energy.co.uk/cfs http://www.all-energy.co.uk/cfs
2 API Challenging Court’ s Decision to Nullify GOM Oil, Natural Gas Auction 2022-02-09 NaN ['Sign in to get the best natural gas news and data. Follow the topics you want and receive the daily emails.', 'Your email address *', ... www.naturalgasintel.com http://www.naturalgasintel.com/api-challenging-courts-decision-to-nullify-gom-oil-natural-gas-auction/ False API Challenging Court’ s Decision to Nullify GOM Oil, Natural Gas Auction ['Sign in to get the best natural gas news and data. Follow the topics you want and receive the daily emails.', 'Your email address *', ... Follow the topics you want and receive the daily emails.\nYour email address *\nYour password *\nRemember me Continue\nReset password\nF... en en 0.999987 1.0 4602 751 bd9c3daf2d6c6f2667cba3cc1ccf4af8c34059bf api|auction|challenging|court|decision|gas|gom|natural|nullify|oil 2022-02-09 naturalgasintel.com/api-challenging-courts-decision-to-nullify-gom-oil-natural-gas-auction True http://www.naturalgasintel.com/api-challenging-courts-decision-to-nullify-gom-oil-natural-gas-auction/ http://www.naturalgasintel.com/api-challenging-courts-decision-to-nullify-gom-oil-natural-gas-auction/
3 Arizona Regulator Blocks Salt River Natural Gas Power Plant Expansion 2022-04-14 NaN ['Sign in to get the best natural gas news and data. Follow the topics you want and receive the daily emails.', 'Your email address *', ... www.naturalgasintel.com http://www.naturalgasintel.com/arizona-regulator-blocks-salt-river-natural-gas-power-plant-expansion/ False Arizona Regulator Blocks Salt River Natural Gas Power Plant Expansion ['Sign in to get the best natural gas news and data. Follow the topics you want and receive the daily emails.', 'Your email address *', ... Follow the topics you want and receive the daily emails.\nYour email address *\nYour password *\nRemember me Continue\nReset password\nF... ca en 0.510561 1.0 6740 1106 c0817579053284ca7921a1da09fc1d7720b1d81f arizona|blocks|expansion|gas|natural|plant|power|regulator|river|salt 2022-04-14 naturalgasintel.com/arizona-regulator-blocks-salt-river-natural-gas-power-plant-expansion True http://www.naturalgasintel.com/arizona-regulator-blocks-salt-river-natural-gas-power-plant-expansion/ http://www.naturalgasintel.com/arizona-regulator-blocks-salt-river-natural-gas-power-plant-expansion/
4 Biden Administration's 2023 Budget Doubles Down on Energy Transition 2022-03-29 NaN ['Sign in to get the best natural gas news and data. Follow the topics you want and receive the daily emails.', 'Your email address *', ... www.naturalgasintel.com http://www.naturalgasintel.com/biden-administrations-2023-budget-doubles-down-on-energy-transition/ False Biden Administration's 2023 Budget Doubles Down on Energy Transition ['Sign in to get the best natural gas news and data. Follow the topics you want and receive the daily emails.', 'Your email address *', ... Follow the topics you want and receive the daily emails.\nYour email address *\nYour password *\nRemember me Continue\nReset password\nF... en en 1.000000 1.0 5714 941 fc8285deefa73230c821637e0d4cd20b29eba383 2023|administration|biden|budget|doubles|down|energy|transition 2022-03-29 naturalgasintel.com/biden-administrations-2023-budget-doubles-down-on-energy-transition True http://www.naturalgasintel.com/biden-administrations-2023-budget-doubles-down-on-energy-transition/ http://www.naturalgasintel.com/biden-administrations-2023-budget-doubles-down-on-energy-transition/

Describe numeric: content_char_count

content_char_count
count 19341.0
mean 4723.312393
std 2782.727649
min 305.0
25% 2563.0
50% 4115.0
75% 6413.0
max 19684.0

Describe numeric: content_word_count

content_word_count
count 19341.0
mean 748.544181
std 444.552426
min 50.0
25% 406.0
50% 654.0
75% 1007.0
max 2995.0

Top values in domain

count
domain
www.energy-xprt.com 4058
www.pv-magazine.com 2945
www.azocleantech.com 2457
www.pv-tech.org 1966
cleantechnica.com 1752
www.thinkgeoenergy.com 1052
www.solarpowerportal.co.uk 850
www.solarpowerworldonline.com 778
www.energyvoice.com 635
solarindustrymag.com 620
solarquarter.com 590
www.rechargenews.com 557
www.naturalgasintel.com 295
www.energyintel.com 170
www.iea.org 158

Top values in lang_title

count
lang_title
en 17075
it 809
hu 553
de 171
es 118
nl 103
fr 102
mt 66
da 37
pt 24
tr 24
eu 24
no 21
pl 17
fi 16

Top values in lang_content

count
lang_content
en 19334
la 2
de 2
it 1
ru 1
es 1

Date coverage for published_dt

non_null min max
published_dt 19341 2022-01-02 2024-10-24

media_dupe_links: C:\Users\gerbe\PycharmProjects\MT\cleantech_data\silver\media\2025-08-09\media_dupe_links.parquet

Text Only
1
shape: (149, 4)

Dtypes

dtype
member_id object
canonical_id object
reason object
group_key object

Top 20 Null Fractions

na_frac
member_id 0.0
canonical_id 0.0
reason 0.0
group_key 0.0

Head(5)

member_id canonical_id reason group_key
0 https://indorenergy.com/to-exhibit/ https://indorenergy.com/to-exhibit/ union
1 https://indorenergy.com/to-exhibit/ https://indorenergy.com/to-exhibit/ union
2 https://www.azocleantech.com/Suppliers.aspx?SupplierID=1870 https://www.azocleantech.com/suppliers.aspx?SupplierID=1870 content_sha1|title_fp_in_domain_len90_dt7d 81e3be4966af51df918bcaeca6738445d3a63e5d|www.azocleantech.com||address|contact|inc|materials|mosaic|quotes
3 https://www.energy-xprt.com/applications/page-4 https://www.energy-xprt.com/applications/page-5 title_fp_in_domain_len90_dt7d www.energy-xprt.com||applications|energy|industry|page
4 https://www.energy-xprt.com/bioenergy/anaerobic-digestion/products/keyword-anaerobic-digester-system-62965 https://www.energy-xprt.com/bioenergy/anaerobic-digestion/products/keyword-anaerobic-digestion-system-19516 content_sha1 d255ef33658f43787e67a2c9feed5836bbc219cd

Duplicate reasons (media_dupe_links)

count
reason
title_fp_in_domain_len90_dt7d 91
content_sha1 33
union 17
content_sha1|title_fp_in_domain_len90_dt7d 8
Python
# PATENTS — canonical, duplicate links, normalized
pat_canon_path = find_file(patents_dir, "patent_canonical")
pat_links_path = find_file(patents_dir, "patent_dupe_links")
pat_norm_path  = find_file(patents_dir, "patents_normalized")

if pat_canon_path and pat_canon_path.exists():
    display(Markdown(f"**patent_canonical:** `{pat_canon_path}`"))
    pc = read_any(pat_canon_path)
    # Coerce typed columns up front so describe()/date_range behave.
    if "abstract_len" in pc.columns:
        pc["abstract_len"] = pd.to_numeric(pc["abstract_len"], errors="coerce")
    if "publication_date_dt" in pc.columns:
        pc["publication_date_dt"] = pd.to_datetime(pc["publication_date_dt"], errors="coerce")
    pc = basic_profile(pc, "patent_canonical")
    if "publication_number" in pc.columns:
        display(Markdown("**Duplicate publication_number rows (if any):**"))
        dup_frac = pc.duplicated(subset=["publication_number"], keep=False).mean()
        show_frac("duplicated_frac", dup_frac)
    if "abstract_len" in pc.columns and pd.api.types.is_numeric_dtype(pc["abstract_len"]):
        display(Markdown("**Describe numeric: `abstract_len`**"))
        display(pc["abstract_len"].describe().to_frame("abstract_len"))
    if "publication_date_dt" in pc.columns:
        display(Markdown("**Date coverage for `publication_date_dt`**"))
        display(date_range(pc, "publication_date_dt"))
else:
    display(Markdown("> ⚠️ `patent_canonical` not found."))

if pat_links_path and pat_links_path.exists():
    display(Markdown(f"**patent_dupe_links:** `{pat_links_path}`"))
    pl = basic_profile(read_any(pat_links_path), "patent_dupe_links")
else:
    display(Markdown("> ⚠️ `patent_dupe_links` not found."))

if pat_norm_path and pat_norm_path.exists():
    display(Markdown(f"**patents_normalized:** `{pat_norm_path}`"))
    pn = basic_profile(read_any(pat_norm_path), "patents_normalized")
    # Per-column element counts for list-valued columns (robust to stringified lists).
    for list_col in ["cpc_codes", "inventors", "application_number", "country_code"]:
        if list_col in pn.columns:
            cardinalities = pn[list_col].apply(cell_cardinality)
            display(Markdown(f"Cardinality of `{list_col}` per patent"))
            display(cardinalities.value_counts().sort_index().to_frame("count"))
else:
    display(Markdown("> ⚠️ `patents_normalized` not found."))

patent_canonical: C:\Users\gerbe\PycharmProjects\MT\cleantech_data\silver\patents\2025-08-09\patent_canonical.parquet

patent_canonical

Text Only
1
shape: (31364, 18)

Dtypes

dtype
publication_date int64
abstract_len int64
title_len int64
lang_title_score float64
lang_abs_score float64
eng float64
publication_number object
application_number object
country_code object
title object
abstract object
inventor object
cpc_code object
title_clean object
abstract_clean object
lang_title object
lang_abs object
publication_date_dt datetime64[ns]

Top 20 Null Fractions

na_frac
lang_title_score 0.000128
publication_number 0.000000
application_number 0.000000
country_code 0.000000
abstract 0.000000
title 0.000000
inventor 0.000000
cpc_code 0.000000
title_clean 0.000000
publication_date 0.000000
abstract_clean 0.000000
abstract_len 0.000000
publication_date_dt 0.000000
lang_title 0.000000
lang_abs 0.000000
lang_abs_score 0.000000
eng 0.000000
title_len 0.000000

Head(5)

publication_number application_number country_code title abstract publication_date inventor cpc_code title_clean abstract_clean abstract_len publication_date_dt lang_title lang_title_score lang_abs lang_abs_score eng title_len
0 AU-2016208290-B2 AU-2016208290-A AU Closed loop control system for heliostats CLOSED LOOP CONTROL SYSTEM FOR HELIOSTATS \nAbstract \nDescribed herein is a control system for a solar energy collection apparatus (1).... 20220317 [BURTON, ALEXANDER] G05D3/12 Closed loop control system for heliostats CLOSED LOOP CONTROL SYSTEM FOR HELIOSTATS\nAbstract\nDescribed herein is a control system for a solar energy collection apparatus (1). T... 987 2022-03-17 en 0.999495 en 1.0 1.0 41
1 AU-2016321918-B2 AU-2016321918-A AU Device for capturing solar energy The invention relates to a device for capturing solar energy, including a plurality of single-axis solar trackers (S1-S5), each one havi... 20220407 [LÓPEZ ONA, Sergio, ROS RUÍZ, Antonio José] H02S40/34 Device for capturing solar energy The invention relates to a device for capturing solar energy, including a plurality of single-axis solar trackers (S1-S5), each one havi... 827 2022-04-07 en 0.920249 en 1.0 1.0 33
2 AU-2017246326-B2 AU-2017246326-A AU Combined window shade and solar panel A window shade system includes a mounting bracket configured to couple to a structure, a shade tube bracket configured to rotatably coup... 20220526 [GEIGER, JAMES] E06B9/40 Combined window shade and solar panel A window shade system includes a mounting bracket configured to couple to a structure, a shade tube bracket configured to rotatably coup... 506 2022-05-26 en 1.000000 en 1.0 1.0 37
3 AU-2017267740-B2 AU-2017267740-A AU System and methods for improving the accuracy of solar energy and wind energy forecasts for an electric utility grid A computer system and method for improving the accuracy of predictions of the amount of renewable energy, such as solar energy and wind ... 20220324 [FORBES, KEVIN F., ZAMPELLI, ERNEST M.] G06Q50/06 System and methods for improving the accuracy of solar energy and wind energy forecasts for an electric utility grid A computer system and method for improving the accuracy of predictions of the amount of renewable energy, such as solar energy and wind ... 810 2022-03-24 en 1.000000 en 1.0 1.0 116
4 AU-2017276466-B2 AU-2017276466-A AU Ocean carbon capture and storage method and device Provided is an ocean carbon capture and storage (ocean CCS) method and device for performing carbon capture and storage with respect to ... 20220602 [PENG, SIGAN] B01D53/1475 Ocean carbon capture and storage method and device Provided is an ocean carbon capture and storage (ocean CCS) method and device for performing carbon capture and storage with respect to ... 881 2022-06-02 en 1.000000 en 1.0 1.0 50

Duplicate publication_number rows (if any):

duplicated_frac: 0.0000

Describe numeric: abstract_len

abstract_len
count 31364.000000
mean 1177.386143
std 270.049041
min 64.000000
25% 1036.000000
50% 1235.000000
75% 1359.000000
max 3201.000000

Date coverage for publication_date_dt

non_null min max
publication_date_dt 31364 2022-01-01 2024-09-05

patent_dupe_links: C:\Users\gerbe\PycharmProjects\MT\cleantech_data\silver\patents\2025-08-09\patent_dupe_links.parquet

Text Only
1
shape: (375362, 3)

Dtypes

dtype
member_id object
canonical_id object
reason object

Top 20 Null Fractions

na_frac
member_id 0.0
canonical_id 0.0
reason 0.0

Head(5)

member_id canonical_id reason
0 AU-2016208290-A AU-2016208290-A longer
1 AU-2016321918-A AU-2016321918-A longer
2 AU-2016321918-A AU-2016321918-A longer
3 AU-2016321918-A AU-2016321918-A longer
4 AU-2016321918-A AU-2016321918-A longer

patents_normalized: C:\Users\gerbe\PycharmProjects\MT\cleantech_data\silver\patents\2025-08-09\patents_normalized.parquet

patents_normalized

Text Only
1
shape: (31364, 13)

Dtypes

dtype
lang_title_score float64
lang_abs_score float64
publication_number object
cpc_codes object
inventors object
application_number object
country_codes object
country_code object
title_clean object
abstract_clean object
lang_title object
lang_abs object
publication_date_dt datetime64[ns]

Top 20 Null Fractions

na_frac
lang_title_score 0.000128
cpc_codes 0.000000
inventors 0.000000
application_number 0.000000
publication_number 0.000000
country_codes 0.000000
country_code 0.000000
abstract_clean 0.000000
title_clean 0.000000
publication_date_dt 0.000000
lang_title 0.000000
lang_abs 0.000000
lang_abs_score 0.000000

Head(5)

publication_number cpc_codes inventors application_number country_codes country_code title_clean abstract_clean publication_date_dt lang_title lang_title_score lang_abs lang_abs_score
0 AU-2016208290-B2 [F24S2050/25, G05D3/12] [BURTON, ALEXANDER] [AU-2016208290-A] [AU] AU Closed loop control system for heliostats CLOSED LOOP CONTROL SYSTEM FOR HELIOSTATS\nAbstract\nDescribed herein is a control system for a solar energy collection apparatus (1). T... 2022-03-17 en 0.999495 en 1.0
1 AU-2016321918-B2 [F24S2030/11, F24S2030/15, F24S25/12, F24S30/425, H02S20/32, H02S40/34, Y02E10/47, Y02E10/50] [LÓPEZ ONA, Sergio, ROS RUÍZ, Antonio José] [AU-2016321918-A] [AU] AU Device for capturing solar energy The invention relates to a device for capturing solar energy, including a plurality of single-axis solar trackers (S1-S5), each one havi... 2022-04-07 en 0.920249 en 1.0
2 AU-2017246326-B2 [E06B2009/2476, E06B2009/6827, E06B9/40, E06B9/42, E06B9/50, E06B9/68, H02S30/20, Y02E10/50] [GEIGER, JAMES] [AU-2017246326-A] [AU] AU Combined window shade and solar panel A window shade system includes a mounting bracket configured to couple to a structure, a shade tube bracket configured to rotatably coup... 2022-05-26 en 1.000000 en 1.0
3 AU-2017267740-B2 [F05B2260/8211, G01W1/10, G01W2001/006, G01W2201/00, G01W2203/00, G06F16/23, G06F17/10, G06F17/18, G06N20/00, G06Q10/04, G06Q50/06] [FORBES, KEVIN F., ZAMPELLI, ERNEST M.] [AU-2017267740-A] [AU] AU System and methods for improving the accuracy of solar energy and wind energy forecasts for an electric utility grid A computer system and method for improving the accuracy of predictions of the amount of renewable energy, such as solar energy and wind ... 2022-03-24 en 1.000000 en 1.0
4 AU-2017276466-B2 [B01D2252/1035, B01D2257/504, B01D2258/012, B01D2258/0283, B01D2259/4566, B01D53/1412, B01D53/1475, B01D53/18, B01D53/62, Y02A50/20, Y02... [PENG, SIGAN] [AU-2017276466-A] [AU] AU Ocean carbon capture and storage method and device Provided is an ocean carbon capture and storage (ocean CCS) method and device for performing carbon capture and storage with respect to ... 2022-06-02 en 1.000000 en 1.0

Cardinality of cpc_codes per patent

count
cpc_codes
1 9907
2 2793
3 2750
4 2656
5 2619
6 2252
7 1825
8 1450
9 1148
10 877
11 682
12 511
13 391
14 299
15 199
16 226
17 123
18 124
19 99
20 58
21 51
22 45
23 38
24 30
25 37
26 29
27 15
28 24
29 3
30 17
31 8
32 5
33 2
34 17
35 5
36 2
37 1
39 4
40 3
41 5
45 5
46 15
47 2
49 6
52 4
72 2

Cardinality of inventors per patent

count
inventors
0 6989
1 6024
2 3086
3 3437
4 2475
5 2458
6 1866
7 1297
8 981
9 689
10 616
11 359
12 292
13 181
14 148
15 134
16 91
17 50
18 50
19 29
20 40
21 9
22 17
23 9
24 10
25 6
26 4
27 7
28 1
29 1
30 1
34 1
37 1
38 2
44 1
45 1
47 1

Cardinality of application_number per patent

count
application_number
1 31364

Cardinality of country_code per patent

count
country_code
1 31364
Python
# OPENALEX — canonical, M2M, refs (updated columns)
# Locate the latest Silver outputs (Parquet or CSV fallback) for each table.
t_canon_path = find_file(openalex_dir, "topics_canonical")
kw_m2m_path  = find_file(openalex_dir, "topic_keywords_m2m")
sib_m2m_path = find_file(openalex_dir, "topic_siblings_m2m")
dom_ref_path = find_file(openalex_dir, "domains_ref")
fld_ref_path = find_file(openalex_dir, "fields_ref")
sub_ref_path = find_file(openalex_dir, "subfields_ref")

topics = None
if t_canon_path and t_canon_path.exists():
    display(Markdown(f"**topics_canonical:** `{t_canon_path}`"))
    topics = read_any(t_canon_path)
    # Updated pipeline uses: display_name, description, works_count, cited_by_count,
    # created_date, updated_date, lang_display_name, lang_description, domain_id/field_id/subfield_id (+ names)
    for c in ["works_count","cited_by_count"]:
        if c in topics.columns:
            topics[c] = pd.to_numeric(topics[c], errors="coerce")
    # Add on-the-fly length columns just for profiling (if you want, optional)
    if "display_name" in topics.columns:
        topics["display_name_len"] = topics["display_name"].astype(str).str.len()
    if "description" in topics.columns:
        topics["description_len"] = topics["description"].astype(str).str.len()

    topics = basic_profile(topics, "topics_canonical")
    for c in ["domain_name","field_name","subfield_name","lang_display_name","lang_description"]:
        if c in topics.columns:
            display(Markdown(f"Top values in `{c}`")); display(topics[c].value_counts().head(15).to_frame("count"))
    for c in ["description_len","display_name_len","works_count","cited_by_count"]:
        if c in topics.columns and pd.api.types.is_numeric_dtype(topics[c]):
            display(Markdown(f"**Describe numeric: `{c}`**")); display(topics[c].describe().to_frame(c))
    # Date coverage (new names)
    for c in ["created_date","updated_date"]:
        if c in topics.columns:
            display(Markdown(f"**Date coverage for `{c}`**")); display(date_range(topics, c))
else:
    display(Markdown("> ⚠️ `topics_canonical` not found."))

if kw_m2m_path and kw_m2m_path.exists():
    display(Markdown(f"**topic_keywords_m2m:** `{kw_m2m_path}`"))
    kw = read_any(kw_m2m_path); kw = basic_profile(kw, "topic_keywords_m2m")
    if topics is not None and "topic_id" in kw.columns and "topic_id" in topics.columns:
        display(Markdown("**FK check** `topic_keywords_m2m.topic_id → topics_canonical.topic_id`"))
        show_frac("unmatched_frac", (~kw["topic_id"].isin(topics["topic_id"])).mean())
else:
    display(Markdown("> ⚠️ `topic_keywords_m2m` not found."))

if sib_m2m_path and sib_m2m_path.exists():
    display(Markdown(f"**topic_siblings_m2m:** `{sib_m2m_path}`"))
    sib = read_any(sib_m2m_path); sib = basic_profile(sib, "topic_siblings_m2m")
    if topics is not None and "topic_id" in topics.columns:
        display(Markdown("**FK checks** (both ends should exist in topics_canonical)"))
        # An absent edge column counts as fully unmatched (1.0) rather than crashing.
        src_unmatched = (~sib["topic_id"].isin(topics["topic_id"])).mean() if "topic_id" in sib.columns else 1.0
        dst_unmatched = (~sib["sibling_topic_id"].isin(topics["topic_id"])).mean() if "sibling_topic_id" in sib.columns else 1.0
        show_frac("edge_src_unmatched_frac", src_unmatched)
        show_frac("edge_dst_unmatched_frac", dst_unmatched)
else:
    display(Markdown("> ⚠️ `topic_siblings_m2m` not found."))

# FIX: the original guards tested {"fk","id"}.issubset(topics.columns | ref.columns),
# which is also satisfied when both names live in the SAME frame and the other has
# neither — then the indexing below raises KeyError. Check each column against the
# frame it is actually read from.
if dom_ref_path and dom_ref_path.exists():
    display(Markdown(f"**domains_ref:** `{dom_ref_path}`"))
    dom = read_any(dom_ref_path); dom = basic_profile(dom, "domains_ref")
    if topics is not None and "domain_id" in topics.columns and "id" in dom.columns:
        display(Markdown("**Join coverage:** topics.domain_id → domains_ref.id"))
        show_frac("unmatched_frac", (~topics["domain_id"].isin(dom["id"])).mean())
if fld_ref_path and fld_ref_path.exists():
    display(Markdown(f"**fields_ref:** `{fld_ref_path}`"))
    fld = read_any(fld_ref_path); fld = basic_profile(fld, "fields_ref")
    if topics is not None and "field_id" in topics.columns and "id" in fld.columns:
        display(Markdown("**Join coverage:** topics.field_id → fields_ref.id"))
        show_frac("unmatched_frac", (~topics["field_id"].isin(fld["id"])).mean())
if sub_ref_path and sub_ref_path.exists():
    display(Markdown(f"**subfields_ref:** `{sub_ref_path}`"))
    sub = read_any(sub_ref_path); sub = basic_profile(sub, "subfields_ref")
    if topics is not None and "subfield_id" in topics.columns and "id" in sub.columns:
        display(Markdown("**Join coverage:** topics.subfield_id → subfields_ref.id"))
        show_frac("unmatched_frac", (~topics["subfield_id"].isin(sub["id"])).mean())

topics_canonical: C:\Users\gerbe\PycharmProjects\MT\cleantech_data\silver\openalex\2025-08-09\topics_canonical.parquet

topics_canonical

Text Only
1
shape: (4516, 17)

Dtypes

dtype
works_count int64
cited_by_count int64
display_name_len int64
description_len int64
topic_id object
display_name object
description object
created_date object
updated_date object
lang_display_name object
lang_description object
domain_id object
domain_name object
field_id object
field_name object
subfield_id object
subfield_name object

Top 20 Null Fractions

na_frac
topic_id 0.0
display_name 0.0
description 0.0
works_count 0.0
cited_by_count 0.0
created_date 0.0
updated_date 0.0
lang_display_name 0.0
lang_description 0.0
domain_id 0.0
domain_name 0.0
field_id 0.0
field_name 0.0
subfield_id 0.0
subfield_name 0.0
display_name_len 0.0
description_len 0.0

Head(5)

topic_id display_name description works_count cited_by_count created_date updated_date lang_display_name lang_description domain_id domain_name field_id field_name subfield_id subfield_name display_name_len description_len
0 T11881 Crystallization and Solubility Studies This cluster of papers focuses on the crystallization processes and control, including topics such as nucleation, solubility, polymorphi... 975416 792291 2024-01-23 2025-08-04T05:30:13.285909 en en 3 Physical Sciences 25 Materials Science 2505 Materials Chemistry 38 308
1 T11475 French Urban and Social Studies This cluster of papers explores the intersection of territorial governance, environmental participation, and sustainable development. It... 656763 703244 2024-01-23 2025-08-04T05:21:24.303694 en en 2 Social Sciences 33 Social Sciences 3312 Sociology and Political Science 31 412
2 T13445 American Constitutional Law and Politics This cluster of papers explores the development of American political thought and history, focusing on topics such as the American found... 467988 1823982 2024-01-23 2025-08-04T05:26:43.888907 en en 2 Social Sciences 33 Social Sciences 3320 Political Science and International Relations 40 327
3 T10165 Classical Antiquity Studies This cluster of papers explores various aspects of ancient Mediterranean civilizations, including the society, economy, literature, and ... 395325 1639281 2024-01-23 2025-08-04T05:20:01.858253 en en 2 Social Sciences 33 Social Sciences 3314 Anthropology 27 354
4 T10362 Biblical Studies and Interpretation This cluster of papers focuses on biblical studies, ancient Judaism, early Christianity, and Christian theology. It explores topics such... 343175 1059942 2024-01-23 2025-08-04T06:01:56.990280 en en 2 Social Sciences 12 Arts and Humanities 1212 Religious studies 35 320

Top values in domain_name

count
domain_name
Physical Sciences 1571
Social Sciences 1487
Health Sciences 844
Life Sciences 614

Top values in field_name

count
field_name
Social Sciences 764
Medicine 692
Engineering 560
Computer Science 302
Arts and Humanities 266
Biochemistry, Genetics and Molecular Biology 248
Agricultural and Biological Sciences 235
Environmental Science 202
Business, Management and Accounting 146
Psychology 144
Materials Science 123
Health Professions 107
Economics, Econometrics and Finance 107
Physics and Astronomy 104
Chemistry 101

Top values in subfield_name

count
subfield_name
Sociology and Political Science 224
Education 137
Molecular Biology 135
Political Science and International Relations 118
Electrical and Electronic Engineering 117
Surgery 96
Plant Science 90
Economics and Econometrics 79
Mechanical Engineering 79
Information Systems 78
Artificial Intelligence 77
Biomedical Engineering 72
Materials Chemistry 69
Organic Chemistry 61
Public Health, Environmental and Occupational Health 54

Top values in lang_display_name

count
lang_display_name
en 4516

Top values in lang_description

count
lang_description
en 4516

Describe numeric: description_len

description_len
count 4516.000000
mean 383.544066
std 72.346466
min 192.000000
25% 330.750000
50% 378.000000
75% 430.000000
max 828.000000

Describe numeric: display_name_len

display_name_len
count 4516.000000
mean 36.277015
std 6.990049
min 14.000000
25% 31.000000
50% 36.000000
75% 40.000000
max 83.000000

Describe numeric: works_count

works_count
count 4516.000000
mean 49773.929805
std 46035.488858
min 69.000000
25% 20376.750000
50% 38681.500000
75% 67517.250000
max 975416.000000

Describe numeric: cited_by_count

cited_by_count
count 4.516000e+03
mean 5.897473e+05
std 7.457060e+05
min 6.000000e+01
25% 9.836400e+04
50% 3.315000e+05
75% 7.917502e+05
max 8.149172e+06

Date coverage for created_date

non_null min max
created_date 4516 2024-01-23 2024-01-23

Date coverage for updated_date

non_null min max
updated_date 4516 2024-03-04 04:59:34.287941 2025-08-04 06:06:45.191053

topic_keywords_m2m: C:\Users\gerbe\PycharmProjects\MT\cleantech_data\silver\openalex\2025-08-09\topic_keywords_m2m.parquet

topic_keywords_m2m

Text Only
1
shape: (45154, 3)

Dtypes

dtype
topic_id object
keyword_norm object
keyword_orig object

Top 20 Null Fractions

na_frac
topic_id 0.0
keyword_norm 0.0
keyword_orig 0.0

Head(5)

topic_id keyword_norm keyword_orig
0 T11881 crystallization Crystallization
1 T11881 nucleation Nucleation
2 T11881 solubility Solubility
3 T11881 polymorphism Polymorphism
4 T11881 ultrasound_assisted_crystallization Ultrasound-Assisted Crystallization

FK check topic_keywords_m2m.topic_id → topics_canonical.topic_id

unmatched_frac: 0.0000

topic_siblings_m2m: C:\Users\gerbe\PycharmProjects\MT\cleantech_data\silver\openalex\2025-08-09\topic_siblings_m2m.parquet

topic_siblings_m2m

Text Only
1
shape: (243006, 2)

Dtypes

dtype
topic_id object
sibling_topic_id object

Top 20 Null Fractions

na_frac
topic_id 0.0
sibling_topic_id 0.0

Head(5)

topic_id sibling_topic_id
0 T11881 T10275
1 T11881 T13889
2 T11881 T12302
3 T11881 T10440
4 T11881 T10311

FK checks (both ends should exist in topics_canonical)

edge_src_unmatched_frac: 0.0000

edge_dst_unmatched_frac: 0.0000

domains_ref: C:\Users\gerbe\PycharmProjects\MT\cleantech_data\silver\openalex\2025-08-09\domains_ref.parquet

domains_ref

Text Only
1
shape: (4, 2)

Dtypes

dtype
id object
name object

Top 20 Null Fractions

na_frac
id 0.0
name 0.0

Head(5)

id name
0 1 Life Sciences
1 2 Social Sciences
2 3 Physical Sciences
3 4 Health Sciences

Join coverage: topics.domain_id → domains_ref.id

unmatched_frac: 0.0000

fields_ref: C:\Users\gerbe\PycharmProjects\MT\cleantech_data\silver\openalex\2025-08-09\fields_ref.parquet

fields_ref

Text Only
1
shape: (26, 2)

Dtypes

dtype
id object
name object

Top 20 Null Fractions

na_frac
id 0.0
name 0.0

Head(5)

id name
0 11 Agricultural and Biological Sciences
1 12 Arts and Humanities
2 13 Biochemistry, Genetics and Molecular Biology
3 14 Business, Management and Accounting
4 15 Chemical Engineering

Join coverage: topics.field_id → fields_ref.id

unmatched_frac: 0.0000

subfields_ref: C:\Users\gerbe\PycharmProjects\MT\cleantech_data\silver\openalex\2025-08-09\subfields_ref.parquet

subfields_ref

Text Only
1
shape: (252, 2)

Dtypes

dtype
id object
name object

Top 20 Null Fractions

na_frac
id 0.0
name 0.0

Head(5)

id name
0 1100 General Agricultural and Biological Sciences
1 1102 Agronomy and Crop Science
2 1103 Animal Science and Zoology
3 1104 Aquatic Science
4 1105 Ecology, Evolution, Behavior and Systematics

Join coverage: topics.subfield_id → subfields_ref.id

unmatched_frac: 0.0000

Unify → silver/unified/unified_docs.parquet

Rules (unchanged, adapted to updated column names where necessary):

  • Media (from media_canonical)

  • doc_id = "media:" + sha1(url_key) (falls back to url_clean, then url, if url_key is missing)
  • title = title_clean
  • text = content_clean
  • date = published_dt
  • lang = lang_content if present else lang_title
  • url = url_clean
  • source = "kaggle.media"

  • Patents (from patents_normalized)

  • doc_id = "patent:" + publication_number
  • title = title_clean
  • text = abstract_clean
  • date = publication_date_dt
  • lang = lang_abs if present else lang_title
  • country = country_code
  • cpc_codes = list[str]
  • source = "kaggle.patent"

  • Topics (from topics_canonical)

  • doc_id = "topic:" + topic_id
  • title = display_name
  • text = description
  • date = updated_date or created_date
  • lang = lang_description or lang_display_name
  • url = topic_url (if present; else NA)
  • source = "openalex.topics"
Python
# ------------------------------ unify helpers ------------------------------

def _sha1_hex(text: str) -> str:
    return hashlib.sha1((text or "").encode("utf-8", "ignore")).hexdigest()

def _to_datetime(s: pd.Series) -> pd.Series:
    return pd.to_datetime(s, errors="coerce")

def _prefer(*vals):
    for v in vals:
        if v is not None and pd.notna(v) and str(v).strip() != "":
            return v
    return None

def _to_list_str(val: Any) -> list[str]:
    if val is None:
        return []
    try:
        if pd.isna(val):
            return []
    except Exception:
        pass
    if isinstance(val, (list, tuple, set)):
        return [str(x) for x in val if str(x).strip()]
    if isinstance(val, str):
        s = val.strip()
        if len(s) >= 2 and s[0] == "[" and s[-1] == "]":
            try:
                parsed = ast.literal_eval(s)
                if isinstance(parsed, (list, tuple, set)):
                    return [str(x) for x in parsed if str(x).strip()]
            except Exception:
                return [s] if s else []
        return [s] if s else []
    return [str(val)] if str(val).strip() else []

def _finalize_unified(df: pd.DataFrame) -> pd.DataFrame:
    # enforce dtypes
    out = df.copy()
    out["doc_type"] = out["doc_type"].astype("category")
    out["date"] = pd.to_datetime(out["date"], errors="coerce")
    # Normalize missing URL/country
    if "url" in out.columns:
        out["url"] = out["url"].astype("string")
    if "country" in out.columns:
        out["country"] = out["country"].astype("string")
    # cpc_codes must be a list[str]
    if "cpc_codes" in out.columns:
        out["cpc_codes"] = out["cpc_codes"].map(_to_list_str)
    # essential fields
    essential = out["doc_id"].notna() & out["title"].notna() & out["text"].notna() & out["date"].notna()
    dropped = len(out) - int(essential.sum())
    if dropped:
        print(f"[unify] dropping {dropped} rows missing essential fields (doc_id/title/text/date)")
    out = out[essential].copy()
    out = out.drop_duplicates(subset=["doc_id"], keep="first")
    out = out.sort_values(["doc_type","doc_id"]).reset_index(drop=True)
    return out

# ------------------------------ unify build ------------------------------

unified_rows: list[dict] = []

# Media view
if 'media' in globals() and isinstance(media, pd.DataFrame) and not media.empty:
    m = media.copy()

    def _m_doc_id(row):
        # prefer url_key; fall back to url_clean, then url
        key = (row.get("url_key") or row.get("url_clean") or row.get("url") or "").strip()
        return "media:" + _sha1_hex(key) if key else None

    # FIX: the original `m.get("lang_content") if "lang_content" in m.columns
    # else m.get("lang_title")` returned None (-> AttributeError on .fillna)
    # when NEITHER column exists. Resolve the column first, with a typed default.
    if "lang_content" in m.columns:
        lang_col = m["lang_content"]
    else:
        lang_col = m.get("lang_title", pd.Series(pd.NA, index=m.index, dtype="object"))

    mv = pd.DataFrame({
        "doc_id": m.apply(_m_doc_id, axis=1),
        "doc_type": "media",
        "title": m.get("title_clean", m.get("title", pd.Series(index=m.index))).astype(str),
        "text":  m.get("content_clean", m.get("content", pd.Series(index=m.index))).astype(str),
        "date":  _to_datetime(m.get("published_dt", pd.Series(pd.NaT, index=m.index))),
        "lang":  lang_col.fillna("unknown").astype(str),
        "source": "kaggle.media",
        "url":   m.get("url_clean", m.get("url", pd.Series(index=m.index))).astype(str),
        # FIX: explicit "string" dtype so the all-NA column does not hit the
        # deprecated pd.concat dtype-inference path (FutureWarning).
        "country": pd.Series(pd.NA, index=m.index, dtype="string"),
        # FIX: [[]]*len(m) aliased ONE shared list across every row; build a
        # fresh empty list per row instead.
        "cpc_codes": pd.Series([[] for _ in range(len(m))], index=m.index, dtype="object"),
    })
    unified_rows.append(mv)

# Patents view (from normalized)
if 'pn' in globals() and isinstance(pn, pd.DataFrame) and not pn.empty:
    p = pn.copy()
    pv = pd.DataFrame({
        "doc_id": "patent:" + p["publication_number"].astype(str),
        "doc_type": "patent",
        "title": p.get("title_clean", pd.Series(index=p.index)).astype(str),
        "text":  p.get("abstract_clean", pd.Series(index=p.index)).astype(str),
        "date":  _to_datetime(p.get("publication_date_dt", pd.Series(pd.NaT, index=p.index))),
        "lang":  p.get("lang_abs", p.get("lang_title", pd.Series(index=p.index))).fillna("unknown").astype(str),
        "source": "kaggle.patent",
        # FIX: explicit "string" dtype (patents carry no URL) so the all-NA
        # column does not trigger the deprecated pd.concat dtype-inference
        # FutureWarning.
        "url":   pd.Series(pd.NA, index=p.index, dtype="string"),
        "country": p.get("country_code", pd.Series(index=p.index)).astype("string"),
        # FIX: the fallback [[]]*len(p) aliased one shared list across rows;
        # build a fresh list per row.
        "cpc_codes": p.get("cpc_codes", pd.Series([[] for _ in range(len(p))], index=p.index, dtype="object")).map(_to_list_str),
    })
    unified_rows.append(pv)

# Topics view (updated column names)
if 'topics' in globals() and isinstance(topics, pd.DataFrame) and not topics.empty:
    t = topics.copy()
    # FIX: per-row fallback "updated_date or created_date" (as the rules state).
    # The original only fell back when EVERY updated_date was missing, leaving
    # rows with a NaT updated_date without a date even when created_date exists.
    date_series = _to_datetime(t.get("updated_date", pd.Series(pd.NaT, index=t.index)))
    created_series = _to_datetime(t.get("created_date", pd.Series(pd.NaT, index=t.index)))
    date_series = date_series.fillna(created_series)
    lang_series = t.get("lang_description", t.get("lang_display_name", pd.Series(index=t.index))).fillna("unknown").astype(str)
    tv = pd.DataFrame({
        "doc_id": "topic:" + t["topic_id"].astype(str),
        "doc_type": "topic",
        "title": t.get("display_name", pd.Series(index=t.index)).astype(str),
        "text":  t.get("description", pd.Series(index=t.index)).astype(str),
        "date":  date_series,
        "lang":  lang_series,
        "source": "openalex.topics",
        "url":   t.get("topic_url", pd.Series(pd.NA, index=t.index, dtype="string")).astype("string"),
        # FIX: typed all-NA column avoids the deprecated pd.concat dtype path.
        "country": pd.Series(pd.NA, index=t.index, dtype="string"),
        # FIX: fresh empty list per row (no shared-object aliasing).
        "cpc_codes": pd.Series([[] for _ in range(len(t))], index=t.index, dtype="object"),
    })
    unified_rows.append(tv)

if not unified_rows:
    raise RuntimeError("No input frames were loaded; cannot build unified docs. Check earlier validation steps.")

# Stack all source views, then enforce the unified schema (dtypes, essential
# fields, dedupe by doc_id, sort).
unified_df = pd.concat(unified_rows, ignore_index=True)
unified_df = _finalize_unified(unified_df)

display(Markdown("### Unified sample"))
display(unified_df.head(8))
display(Markdown("**Counts by doc_type**")); display(unified_df["doc_type"].value_counts().to_frame("count"))
display(Markdown("**Top languages**")); display(unified_df["lang"].value_counts().head(12).to_frame("count"))
# FIX: doc_type is categorical; observed=True silences the pandas FutureWarning
# about the changing groupby default and — since all three categories are
# present — leaves the output table unchanged.
display(Markdown("**Date coverage**")); display(unified_df.groupby("doc_type", observed=True)["date"].agg(["min","max","count"]))

# Write with explicit Arrow schema (cpc_codes as list<string>), fallback to pandas parquet
# The explicit schema guarantees a stable on-disk layout for downstream readers
# even when a column is all-NA in this run (e.g. country for a media-only build).
unified_dir = root / "silver" / "unified"
unified_dir.mkdir(parents=True, exist_ok=True)
pq_path = unified_dir / "unified_docs.parquet"

try:
    import pyarrow as pa, pyarrow.parquet as pq
    # Build schema; allow None for url/country
    # (pa.string() fields are nullable by default; date is stored as ns timestamps
    # to match the frame's datetime64[ns] column.)
    schema = pa.schema([
        pa.field("doc_id", pa.string()),
        pa.field("doc_type", pa.string()),
        pa.field("title", pa.string()),
        pa.field("text", pa.string()),
        pa.field("date", pa.timestamp("ns")),
        pa.field("lang", pa.string()),
        pa.field("source", pa.string()),
        pa.field("url", pa.string()),
        pa.field("country", pa.string()),
        pa.field("cpc_codes", pa.list_(pa.string())),
    ])
    # Ensure list[str]
    # (re-normalized on a copy so the in-memory unified_df is left untouched)
    udf = unified_df.copy()
    udf["cpc_codes"] = udf["cpc_codes"].map(_to_list_str)
    # Column order is pinned to schema.names so from_pandas aligns frame
    # columns with schema fields positionally.
    table = pa.Table.from_pandas(udf[list(schema.names)], schema=schema, preserve_index=False)
    pq.write_table(table, pq_path)
    print(f"[unify] Wrote Arrow-parquet with explicit schema → {pq_path}")
except Exception as exc:
    # Broad catch is deliberate: covers both a missing pyarrow install and any
    # schema-cast failure; the pandas writer is the best-effort fallback.
    print(f"[unify] Arrow write failed ({type(exc).__name__}: {exc}); falling back to pandas.to_parquet")
    unified_df.to_parquet(pq_path, index=False)
    print(f"[unify] Wrote pandas parquet → {pq_path}")

# quick sanity echo
print("Rows:", len(unified_df), "| Path:", pq_path)
Text Only
1
2
C:\Users\gerbe\AppData\Local\Temp\ipykernel_66220\2918688680.py:127: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.
  unified_df = pd.concat(unified_rows, ignore_index=True)

Unified sample

doc_id doc_type title text date lang source url country cpc_codes
0 media:000999be09894ed5b098ed528fe55b30e3e26054 media JinkoSolar and Marubeni Deliver 6MWh of SunTera ESS to Kitakyushu, Japan, Enhancing Efficient and Safe Energy Management – pv magazine International Recently, JinkoSolar has successfully signed a supply agreement with Marubeni Corporation for two 3MWh SunTera energy storage systems, providing a total of ... 2024-10-23 en kaggle.media https://www.pv-magazine.com/press-releases/jinkosolar-and-marubeni-deliver-6mwh-of-suntera-ess-to-kitakyushu-japan-enhancing-efficient-and-safe-energy-manag... <NA> []
1 media:000d800574212a3f7263a0aa491e38d0fbad660f media Combustion Equipment ( Energy Management) Videos The GCU evo ( Gas Combustion Unit) developed by SAACKE makes the safe transport of liquefied natural gas ( LNG) at sea possible. It utilises surplus boil-of... 2023-07-26 en kaggle.media https://www.energy-xprt.com/energy-management/combustion-equipment/videos <NA> []
2 media:0016fb2f14828861db0b2f07275e119dad2d44c3 media Wood appointed owner's engineer for UK’ s first commercial lithium refinery Mineral processing company Green Lithium has appointed Wood as its owner’ s engineer, as it progresses plans to build and operate the UK’ s first large-scal... 2022-01-24 en kaggle.media https://www.energyvoice.com/renewables-energy-transition/381915/wood-appointed-owners-engineer-for-uks-first-commercial-lithium-refinery/ <NA> []
3 media:0016fc189cc7a6b4b1b6992dd511ba6ef7710564 media Energy Technology Perspectives 2020 – Analysis or subcribe to a paid service.\nTo avoid the worst consequences of climate change, the global energy system must rapidly reduce its emissions. Calls to redu... 2024-02-03 en kaggle.media https://www.iea.org/reports/energy-technology-perspectives-2020 <NA> []
4 media:0017c5e566b4104ef39b4fe93b8068b870644e2a media 'Wartime footing ' upgrade needed as grid stalls offshore wind plans, warns UK champion The UK was warned that availability of grid connections is emerging as the number-one obstacle to its offshore wind ambitions, as the government’ s own ‘ ch... 2023-04-05 en kaggle.media https://www.rechargenews.com/news/2-1-1431535 <NA> []
5 media:0018edf9110251ce52dd8bfd30f7b6dc011e9f62 media Hailo Wind Systems Gmbh & Co. Kg Profile The wind industry is characterised by its great innovative force, requiring a high degree of reliability, flexibility, and perseverance. Only those companie... 2023-06-14 en kaggle.media https://www.energy-xprt.com/companies/hailo-wind-systems-gmbh-co-kg-125537 <NA> []
6 media:001c7395ac57425f1bb0440cc36d2e6bccdd4858 media New method to avoid overvoltage in vertical PV linked to low-voltage grids – pv magazine International A research group in Finland has created a new workflow to prevent voltage rises in bifacial vertical PV systems located at high latitudes. According to thei... 2023-07-03 en kaggle.media https://www.pv-magazine.com/2023/07/03/new-method-to-avoid-overvoltage-in-vertical-pv-linked-to-low-voltage-grids/ <NA> []
7 media:00241ce5830570c00ee65588e48c5430bdbbb492 media SolarEV City Concept Could Be Viable in High-Latitude Cities Like Paris By continuing to browse this site you agree to our use of cookies. More info.\nThe iconic city of Paris is synonymous with climate change, thanks in part to... 2023-09-19 en kaggle.media https://www.azocleantech.com/news.aspx?newsID=34010 <NA> []

Counts by doc_type

count
doc_type
patent 31364
media 19341
topic 4516

Top languages

count
lang
en 52729
zh 2210
ko 133
fr 91
de 27
ja 7
pl 6
ar 3
ru 3
pt 2
cs 2
la 2

Date coverage

Text Only
1
2
C:\Users\gerbe\AppData\Local\Temp\ipykernel_66220\2918688680.py:134: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  display(Markdown("**Date coverage**")); display(unified_df.groupby("doc_type")["date"].agg(["min","max","count"]))
min max count
doc_type
media 2022-01-02 00:00:00.000000 2024-10-24 00:00:00.000000 19341
patent 2022-01-01 00:00:00.000000 2024-09-05 00:00:00.000000 31364
topic 2024-03-04 04:59:34.287941 2025-08-04 06:06:45.191053 4516
Text Only
1
2
[unify] Wrote Arrow-parquet with explicit schema → C:\Users\gerbe\PycharmProjects\MT\cleantech_data\silver\unified\unified_docs.parquet
Rows: 55221 | Path: C:\Users\gerbe\PycharmProjects\MT\cleantech_data\silver\unified\unified_docs.parquet