Skip to content

Cleantech Bronze Exploration — Media & Patents

Visual Overview


Visual Overview


Purpose: quick, reproducible exploration of the Kaggle Bronze mirrors to decide what the cleaning pipeline should do next.

Incoming variables:

Patent dataset schema:

  • publication_number: The publication number of the patent.
  • application_number: The application number corresponding to the patent.
  • country_code: The code representing the country in which the patent was filed.
  • title: The title of the patent in its localized form.
  • abstract: A localized abstract describing the patent.
  • publication_date: The date on which the patent was published.
  • inventor: The inventor(s) of the patent.
  • cpc_code: Cooperative Patent Classification code(s) associated with the patent.

Media dataset schema:

  • Unnamed: 0: Index column (usually auto-generated).
  • title: The title of the media article.
  • date: The publication date of the article.
  • author: The author(s) of the article.
  • content: The main content of the article.
  • domain: The domain of the website where the article was published.
  • url: The URL of the article.

0) Setup & Path Resolution

Purpose Establish a reproducible runtime environment, resolve project-relative paths independent of launch location, and create a standardized report directory layout.

Inputs None.

Outputs / Artifacts

  • Console dump of environment (Python, pandas) and resolved directories.
  • Created reports/exploration/ and reports/exploration/figs/ for downstream CSV/PNG artifacts.

Cleaning Objective Maximize reproducibility and discoverability of artifacts produced by later steps.

Why data-driven All subsequent modules persist their diagnostics (CSV, PNG) side-by-side in these folders, enabling quick review and CI diffs.

Python
# --- Environment & imports ---
import os, sys, json, re, itertools, hashlib, urllib.parse, warnings, ast, html
from pathlib import Path
from typing import Optional, List, Any, Dict
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display

# Keep notebook output focused; pandas FutureWarnings are noise for this exploration.
warnings.filterwarnings("ignore", category=FutureWarning)
pd.options.display.max_colwidth = 140
pd.options.display.float_format = '{:,.2f}'.format

# Best-effort .env loading; python-dotenv is an optional dependency.
try:
    from dotenv import load_dotenv
    load_dotenv()
except Exception:
    pass

# --- Resolve project root robustly (works from anywhere) ---
NB_DIR = Path.cwd()

# Known layout: <project>/cleaning_1/notebooks -> project root is two levels up.
if NB_DIR.name == "notebooks" and NB_DIR.parent.name == "cleaning_1":
    PROJECT_ROOT = NB_DIR.parents[1]
else:
    # Otherwise walk upward until a directory containing `cleantech_data` is found.
    PROJECT_ROOT = NB_DIR
    for p in NB_DIR.parents:
        if (p / "cleantech_data").exists():
            PROJECT_ROOT = p
            break

# Bronze mirror location; overridable via env var for CI / other machines.
BRONZE_KAGGLE = Path(
    os.getenv("KAGGLE_BRONZE_DIR", PROJECT_ROOT / "cleantech_data" / "bronze" / "kaggle")
).resolve()

DATA_ROOT = BRONZE_KAGGLE.parents[1]  # <project>/cleantech_data
REPO_ROOT   = DATA_ROOT.parent
# NOTE(review): reports are anchored at NB_DIR (the launch cwd), not PROJECT_ROOT,
# so artifact location depends on where the notebook is started — confirm intended.
REPORTS_DIR = NB_DIR / 'reports' / 'exploration'
FIGS_DIR    = REPORTS_DIR / "figs"
for d in (REPORTS_DIR, FIGS_DIR):
    d.mkdir(parents=True, exist_ok=True)

print("Python:", sys.executable)
print("PROJECT_ROOT   :", PROJECT_ROOT)
print("BRONZE_KAGGLE :", BRONZE_KAGGLE)
print("Reports ->", REPORTS_DIR)
print("Figures ->", FIGS_DIR)

# --- Controls ---
# Optional row cap for quick runs; read the env var once instead of twice.
_n_rows_env = os.getenv("N_ROWS")
N_ROWS      = int(_n_rows_env) if _n_rows_env else None
MEDIA_SLUG  = "cleantech-media-dataset"
PATENT_SLUG = "cleantech-google-patent-dataset"

def latest_date_dir(base: Path) -> Path:
    """Return the newest date-bucket directory (named YYYY-MM-DD) under *base*.

    Fix: non-date directories are ignored so a stray folder (e.g. "zz_notes")
    cannot lexicographically outrank real date buckets. If no date-named
    directory exists, falls back to the previous behavior (latest of all
    subdirectories by name).

    Raises:
        FileNotFoundError: if *base* is missing or has no subdirectories.
    """
    if not base.exists():
        raise FileNotFoundError(f'Base path not found: {base}')
    subdirs = [p for p in base.iterdir() if p.is_dir()]
    if not subdirs:
        raise FileNotFoundError(f'No date buckets under {base}')
    # dirs named YYYY-MM-DD
    date_like = [p for p in subdirs if re.fullmatch(r"\d{4}-\d{2}-\d{2}", p.name)]
    pool = date_like or subdirs
    # ISO dates sort correctly as strings, so max-by-name is max-by-date.
    return max(pool, key=lambda p: p.name)

def latest_kaggle_bronze_dir(slug: str) -> Path:
    """Resolve the newest Bronze date bucket for the given Kaggle dataset slug."""
    slug_root = BRONZE_KAGGLE / slug
    return latest_date_dir(slug_root)

def load_latest_extracted(dir: Path, pattern: str, read_fn, n_rows: Optional[int], **read_kwargs) -> pd.DataFrame:
    """Load the lexicographically newest file in *dir* matching *pattern*.

    The file is read with *read_fn* (e.g. ``pd.read_json`` / ``pd.read_csv``)
    and optionally truncated to the first *n_rows* rows.

    Raises:
        FileNotFoundError: if *dir* is missing or no file matches *pattern*.
    """
    if not dir.exists():
        raise FileNotFoundError(f"Missing extracted dir: {dir}")
    matches = sorted(dir.glob(pattern))
    if not matches:
        raise FileNotFoundError(f"No files matching {pattern} in {dir}")
    newest = matches[-1]
    frame = read_fn(newest, **read_kwargs)
    if n_rows is not None:
        frame = frame.head(n_rows)
    print(f"Using extracted file={newest.name} -> {frame.shape} rows, columns")
    return frame

def save_kaggle(df: pd.DataFrame, name: str) -> Path:
    """Persist *df* into SILVER_KAGGLE_DIR as Parquet, falling back to CSV.GZ
    when the Parquet engine is unavailable or fails. Returns the written path."""
    base = SILVER_KAGGLE_DIR / name
    out = base.with_suffix(".parquet")
    try:
        df.to_parquet(out, index=False)
    except Exception as exc:
        # No parquet engine (or write error): degrade gracefully to gzip CSV.
        out = base.with_suffix(".csv.gz")
        df.to_csv(out, index=False, compression="gzip")
        print(f"[Fallback CSV.GZ] → {out.name} rows={len(df):,} ({exc})")
    print("Kaggle Silver →", out)
    return out

# Resolve latest Bronze snapshots
# Locate the newest date bucket per dataset slug, then point at the
# pre-extracted payloads inside each bucket.
media_dir       = latest_kaggle_bronze_dir(MEDIA_SLUG)
patent_dir      = latest_kaggle_bronze_dir(PATENT_SLUG)
media_ext_dir   = media_dir / "extracted"
patent_ext_dir  = patent_dir / "extracted"

KAGGLE_BUCKET = media_dir.name  # e.g., '2025-08-09' (patent_dir.name should match)
# NOTE(review): the Silver output is anchored at NB_DIR (cwd), so it lands under
# notebooks/cleantech_data/... rather than <project>/cleantech_data/... while
# Bronze uses PROJECT_ROOT — confirm this asymmetry is intended.
SILVER_KAGGLE_DIR = (NB_DIR / "cleantech_data" / "silver" / "kaggle" / KAGGLE_BUCKET).resolve()
SILVER_KAGGLE_DIR.mkdir(parents=True, exist_ok=True)
print("SILVER_KAGGLE_DIR :", SILVER_KAGGLE_DIR)

# Patents ship as JSON Lines; media as CSV. N_ROWS (if set) caps rows for quick runs.
df_patents = load_latest_extracted(patent_ext_dir, 'CleanTech*.json', pd.read_json, N_ROWS, lines=True)
df_media   = load_latest_extracted(media_ext_dir, 'cleantech_media_dataset_v3_*.csv', pd.read_csv, N_ROWS)

print("Sample shapes -> MEDIA:", df_media.shape, " PATENTS:", df_patents.shape)
# Keep untouched copies for "pre-clean" comparisons and audits
df_media_raw   = df_media.copy(deep=True)
df_patents_raw = df_patents.copy(deep=True)
Text Only
1
2
3
4
5
6
7
8
9
Python: C:\Users\gerbe\PycharmProjects\MT\.venv\Scripts\python.exe
PROJECT_ROOT   : C:\Users\gerbe\PycharmProjects\MT
BRONZE_KAGGLE : C:\Users\gerbe\PycharmProjects\MT\cleantech_data\bronze\kaggle
Reports -> C:\Users\gerbe\PycharmProjects\MT\notebooks\reports\exploration
Figures -> C:\Users\gerbe\PycharmProjects\MT\notebooks\reports\exploration\figs
SILVER_KAGGLE_DIR : C:\Users\gerbe\PycharmProjects\MT\notebooks\cleantech_data\silver\kaggle\2025-08-09
Using extracted file=CleanTech_22-24_updated.json -> (406857, 8) rows, columns
Using extracted file=cleantech_media_dataset_v3_2024-10-28.csv -> (20111, 7) rows, columns
Sample shapes -> MEDIA: (20111, 7)  PATENTS: (406857, 8)

Run snapshot (current data)

  • MEDIA: (20,111 × 7)
  • PATENTS: (406,857 × 8)

1) Schema & Nulls

What it does Profiles each dataframe to record dtype, non-null counts, unique cardinality (hash-safe for lists/dicts/arrays), percent nulls, and two previews:

  • raw_example — first non-null value shown verbatim (quotes/brackets/np.int64(...) preserved)
  • examples — first three values, also shown raw

Inputs df_media, df_patents

Outputs - reports/exploration/media_profile_before.csv - reports/exploration/patent_profile_before.csv

Cleaning objective Surface dead columns, non-scalar cells, mixed types, and candidate keys.

Why data-driven Literal examples plus exact counts justify drops (e.g., author) and guide normalization (e.g., stringified lists).

Python
def _hashable(x):
    if x is None or (isinstance(x, float) and pd.isna(x)): return x
    if isinstance(x, dict): return tuple(sorted((k, _hashable(v)) for k, v in x.items()))
    if isinstance(x, (list, tuple, set)): return tuple(_hashable(v) for v in x)
    if hasattr(x, "tolist"): return _hashable(x.tolist())
    try:
        hash(x); return x
    except TypeError:
        return json.dumps(x, sort_keys=True, default=str)

def _safe_nunique(s: pd.Series) -> int:
    try:
        return s.nunique(dropna=True)
    except TypeError:
        return s.map(_hashable).nunique(dropna=True)

def _first_non_null(s: pd.Series):
    idx = s.first_valid_index()
    return None if idx is None else s.loc[idx]

def _raw_repr(v) -> str:
    if v is None or (isinstance(v, float) and pd.isna(v)):
        return ""
    try:
        return repr(v)   # preserves quotes/brackets/escapes
    except Exception:
        return str(v)

def profile(df: pd.DataFrame, example_rows: int = 3) -> pd.DataFrame:
    """Per-column profile: dtype, non-null count, hash-safe cardinality, null %,
    plus literal examples (repr-preserved so quotes/brackets survive).

    Returns an empty frame for an empty input.
    """
    if df.empty:
        return pd.DataFrame()

    report = pd.DataFrame({
        "dtype": df.dtypes.astype(str),
        "non_null": df.notna().sum(),
        "n_unique": df.apply(_safe_nunique),
    })
    report["null_pct"] = ((1 - report["non_null"] / len(df)) * 100).round(2)

    first_examples = []
    example_lists = []
    for col in df.columns:
        values = df[col].dropna()
        first_examples.append(_raw_repr(_first_non_null(values)))            # literal
        example_lists.append([_raw_repr(v) for v in values.head(example_rows)])

    report["raw_example"] = first_examples
    report["examples"] = example_lists
    return report

# Profile both raw frames and persist the "before" snapshots for audit/diffing.
media_prof_before  = profile(df_media)
patent_prof_before = profile(df_patents)
display(media_prof_before); display(patent_prof_before)

media_prof_before.to_csv(REPORTS_DIR / 'media_profile_before.csv', index=True)
patent_prof_before.to_csv(REPORTS_DIR / 'patent_profile_before.csv', index=True)
print("Saved profiles (before) →", REPORTS_DIR)
dtype non_null n_unique null_pct raw_example examples
Unnamed: 0 int64 20111 20111 0.00 np.int64(93320) [93320, 93321, 98159]
title object 20111 20016 0.00 'XPeng Delivered ~100,000 Vehicles In 2021' ['XPeng Delivered ~100,000 Vehicles In 2021', 'Green Hydrogen: Drop In Bucket Or Big Splash?', 'World’ s largest floating PV plant goes ...
date object 20111 979 0.00 '2022-01-02' ['2022-01-02', '2022-01-02', '2022-01-03']
author float64 0 0 100.00 []
content object 20111 20068 0.00 '[\'Chinese automotive startup XPeng has shown one of the most dramatic auto production ramp-ups in history, and the good news is it onl... ['[\'Chinese automotive startup XPeng has shown one of the most dramatic auto production ramp-ups in history, and the good news is it on...
domain object 20111 25 0.00 'cleantechnica' ['cleantechnica', 'cleantechnica', 'pv-magazine']
url object 20111 20111 0.00 'https://cleantechnica.com/2022/01/02/xpeng-delivered-100000-vehicles-in-2021/' ['https://cleantechnica.com/2022/01/02/xpeng-delivered-100000-vehicles-in-2021/', 'https://cleantechnica.com/2022/01/02/its-a-green-hydr...
dtype non_null n_unique null_pct raw_example examples
publication_number object 406857 31366 0.00 'CN-117138249-A' ['CN-117138249-A', 'CN-117151396-A', 'CN-117141530-A']
application_number object 406857 28749 0.00 'CN-202311356270-A' ['CN-202311356270-A', 'CN-202311109834-A', 'CN-202310980795-A']
country_code object 406857 37 0.00 'CN' ['CN', 'CN', 'CN']
title object 406857 53357 0.00 '一种石墨烯光疗面罩' ['一种石墨烯光疗面罩', 'Distributed economic scheduling method for wind, solar, biogas and hydrogen multi-energy multi-microgrid system', '氢能源动力轨...
abstract object 406857 36108 0.00 'The application provides a graphene phototherapy mask, and relates to the technical field of phototherapy devices of graphene photother... ['The application provides a graphene phototherapy mask, and relates to the technical field of phototherapy devices of graphene photothe...
publication_date int64 406857 646 0.00 np.int64(20231201) [20231201, 20231201, 20231201]
inventor object 406857 21151 0.00 ['LI HAITAO', 'CAO WENQIANG'] [['LI HAITAO', 'CAO WENQIANG'], ['HU PENGFEI', 'LI ZIMENG'], ['XIE BO', 'ZHANG SHUIQING', 'ZHOU FEI', 'LIU YONG', 'Zhou Houyi']]
cpc_code object 406857 17741 0.00 'A61N2005/0654' ['A61N2005/0654', 'G06Q50/06', 'Y02T90/40']
Text Only
1
Saved profiles (before) → C:\Users\gerbe\PycharmProjects\MT\notebooks\reports\exploration

Notes from data

Media

  • url is unique (20,111/20,111) → solid primary key.
  • author is 100% null (dtype float64 from NaNs) → drop.
  • Unnamed: 0 is an imported index column → drop.
  • date is object with values like '2022-01-02' → parse to datetime.
  • content is a stringified list-of-strings (see raw_example beginning with '[...), not a real list → parse (e.g., ast.literal_eval) and then either flatten (join to text) or normalize to a child table.
  • title has 20,016 uniques vs 20,111 rows (≈95 duplicates) → minor reprints expected.
  • domain has only 25 uniques → a few publishers dominate.

Patents

  • Heavy replication: 406,857 rows but only 31,366 unique publication_number (≈12.97 rows per publication) and 28,749 unique application_number (≈14.15 per application).
  • Likely row explosion from many-to-many fields: inventor is a real list (see list in raw_example), and multiple cpc_code values per publication (scalar per row but repeated across rows).
  • publication_date is int64 in YYYYMMDD (e.g., np.int64(20231201)) → convert to datetime.
  • title (53,357 uniques) and abstract (36,108 uniques) vary across duplicates → multiple languages/versions present.
  • country_code has 37 uniques, as expected.

Actionable cleanups

  • Drop: author, Unnamed: 0.
  • Parse dates: media.date, patents.publication_date.
  • Normalize:

  • Media: keep one row per url; parse content from string → list; then either flatten to text or store in a separate media_content table.

  • Patents: create base publication table keyed by publication_number; junction tables for publication_cpc and publication_inventor.
  • De-dup policy (patents): group by publication_number; pick a preferred record (e.g., longest abstract or preferred language) before joining back to junctions.

2) Date Columns — Robust Parsing

What it does Detects date-like columns and parses safely.

Inputs df_media['date'], df_patents['publication_date'] (and any column with "date" in its name)

Outputs reports/exploration/media_date_parse_report.csv reports/exploration/patent_date_parse_report.csv

Cleaning objective Trustworthy timelines for plots, filters, and joins.

Why data-driven I log parse success % and min/max before downstream use.

Python
def parse_date_col(s: pd.Series, name: str) -> pd.Series:
    """
    Parse a date-like column. Integer series are treated as YYYYMMDD (explicit
    format avoids misinterpretation as epoch timestamps); object series try
    strict YYYYMMDD first, then fall back to pandas' general parser.
    """
    if s.dtype.kind in "iu":
        as_text = s.astype("Int64").astype(str)
        return pd.to_datetime(as_text, format="%Y%m%d", errors="coerce")
    if s.dtype == object:
        strict = pd.to_datetime(s, format="%Y%m%d", errors="coerce")
        loose = pd.to_datetime(s, errors="coerce")
        # prefer the strict parse wherever it succeeded
        return loose.where(strict.isna(), strict)
    return pd.to_datetime(s, errors="coerce")

def date_report(df: pd.DataFrame, cols: List[str]) -> pd.DataFrame:
    """Summarize parse success (%), non-null count, min and max per date column."""
    summary: Dict[str, dict] = {}
    for col in cols:
        parsed = parse_date_col(df[col], col)
        has_any = parsed.notna().any()
        summary[col] = {
            "non_null": int(parsed.notna().sum()),
            "min": str(parsed.min()) if has_any else None,
            "max": str(parsed.max()) if has_any else None,
            "parse_success_pct": float(100 * parsed.notna().mean()),
        }
    return pd.DataFrame(summary).T

# Any column whose name contains "date" is treated as a date candidate.
media_date_cols  = [c for c in df_media.columns if "date" in c.lower()]
patent_date_cols = [c for c in df_patents.columns if "date" in c.lower()]

media_date_report  = date_report(df_media, media_date_cols) if media_date_cols else pd.DataFrame()
patent_date_report = date_report(df_patents, patent_date_cols) if patent_date_cols else pd.DataFrame()

display(media_date_report)
display(patent_date_report)

# Persist the parse diagnostics next to the other exploration artifacts.
media_date_report.to_csv(REPORTS_DIR / "media_date_parse_report.csv")
patent_date_report.to_csv(REPORTS_DIR / "patent_date_parse_report.csv")
print("Saved date parse reports.")
non_null min max parse_success_pct
date 20111 2022-01-02 00:00:00 2024-10-24 00:00:00 100.00
non_null min max parse_success_pct
publication_date 406857 2022-01-01 00:00:00 2024-09-05 00:00:00 100.00
Text Only
1
Saved date parse reports.

Data Notes

  • Media date: 100% parse, min 2022‑01‑02, max 2024‑10‑24.

  • Patents publication_date: 100% parse, min 2022‑01‑01, max 2024‑09‑05.

3) Text Flattening & Cleaning (Safe)

Description Turn lists/JSON‑ish strings into text, remove HTML/URLs/boilerplate, normalize whitespace.

Input MEDIA.title, MEDIA.content, MEDIA.url; PATENTS.title, PATENTS.abstract.

Output *_clean columns, character/word counts.

Cleaning Objective Retain actual article/patent text while scrubbing obvious junk; avoid over‑stripping.

Why data‑driven We compare raw vs clean length distributions and keep per‑row counts.

Python
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
# --- required imports & light fallbacks ---
from typing import Optional, Any
import re, html, ast, urllib.parse, collections, warnings
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display  # for notebook-friendly tables

# optional: where to save figs (reuse global FIGS_DIR from the setup cell)
FIGS_DIR.mkdir(parents=True, exist_ok=True)

# --- small config toggles ---
# Articles with fewer words than this get flagged as "thin" later in the pipeline.
THIN_WORD_THRESHOLD = 120   # cheap quality flag for very short articles

# --- URL/domain helpers ---
def canonicalize_url(url: str) -> str:
    """Lower-case the host, drop tracking params (utm_*, fbclid, gclid, mc_cid,
    mc_eid) and the fragment; return the input unchanged on any parse failure."""
    if not isinstance(url, str) or not url:
        return url
    try:
        parts = urllib.parse.urlsplit(url)
        params = urllib.parse.parse_qsl(parts.query, keep_blank_values=True)
        # keep only non-tracking params
        kept = [
            (k, v) for (k, v) in params
            if not re.match(r"^(utm_|fbclid$|gclid$|mc_cid$|mc_eid$)", k)
        ]
        rebuilt_query = urllib.parse.urlencode(kept)
        host = parts.netloc.lower()
        return urllib.parse.urlunsplit((parts.scheme, host, parts.path, rebuilt_query, ""))  # strip fragment
    except Exception:
        return url

def extract_domain(url: str) -> Optional[str]:
    """Pull the lower-cased host out of *url*; None for non-strings or no match."""
    if not isinstance(url, str):
        return None
    match = re.search(r"://([^/]+)", url)
    if match is None:
        return None
    return match.group(1).lower()

# Paths that indicate listing/archive pages rather than single articles.
_LISTING_PATH_RE = re.compile(
    r"/(?:page/\d+|author/|category/|tag/|tags?/|topics?/|search/?)",
    re.I,
)
def is_listing(url: str) -> bool:
    """True when the URL path looks like a listing/archive page (see regex above)."""
    try:
        path = urllib.parse.urlsplit(str(url)).path
        return _LISTING_PATH_RE.search(path) is not None
    except Exception:
        return False

# --- Text normalization (gentle) ---

# Only strip *real* tags (letter right after '<'); protect "< 500 µm"
SAFE_TAG_RE = re.compile(r"<\s*/?\s*[A-Za-z][^>]*>")

# Targeted banners to remove ONLY if they appear at the very start
# (cookie walls, sign-in prompts, report-download nags).
TEXT_PREFIXES = [
    "By clicking `` Allow All '' you agree to the storing of cookies",
    "We use cookies to enhance your experience.",
    "We use cookies to enhance your experience",
    "Sign in to get the best natural gas news and data.",
    "Create a free IEA account to download our reports",
]

# Regex-shaped boilerplate prefixes (e.g., a Mailchimp signup-form JS blob).
REGEX_PREFIXES = [
    r"^window\.dojoRequire\s*\(\s*\[\s*[`'\"\s]*mojo/signup-forms/Loader.*?\)\s*\}\)\s*",
]

# Common tail junk (domain independent) – only applied in the last ~40 lines
GENERIC_TAIL_PHRASES = [
    "No comments were found for",
    "Energy XPRT is a global marketplace",
    "All Rights Reserved. Terms Privacy",
    "All Rights Reserved. Terms",
    "Terms Privacy",
    "Need help finding the right suppliers? Try XPRT Sourcing.",
    "Upcoming conferences organized by SGO:",
    "This site uses Akismet to reduce spam.",
    "Learn how your comment data is processed.",
]
# Single case-insensitive alternation over the literal phrases above.
GENERIC_TAIL_RE = re.compile("|".join([re.escape(p) for p in GENERIC_TAIL_PHRASES]), re.I)

# Domain-specific tail droppers (lightweight)
DOMAIN_TAIL_DROPS = {
    "www.energy-xprt.com": [
        r"^No comments were found",
        r"^Energy XPRT is a global marketplace",
        r"All Rights Reserved\.",
        r"^\s*Terms\s+Privacy\s*$",
        r"^Need help finding the right suppliers\?",
        r"^--\s*--\s*--",
        r"^Source:\s",
    ],
    "www.azocleantech.com": [
        r"^Disclaimer:",
        r"\bRetrieved on\b",
        r"\bviewed\b",
        r"\( accessed\b",
    ],
}

# Protect technical lines (don't drop if they match any of these signals)
UNIT_TOKENS = [
    "mw","gw","kwh","mwh","gwh","°c","°f","µm","mm","cm","km","bar","psi","ppm","ppb",
    "mmbtu","kv","ma","kw","tpa","nm","kg","t","ton","lbs",
    "co2","h2","rng","egs","csp","pv","ev","lithium","li-ion","lifepo4"
]

def _maybe_list_to_str(x: Any) -> str:
    if isinstance(x, (list, tuple)):
        return " ".join(map(str, x))
    if isinstance(x, str) and len(x) >= 2 and x.strip().startswith("[") and x.strip().endswith("]"):
        try:
            y = ast.literal_eval(x)
            if isinstance(y, (list, tuple)):
                return " ".join(map(str, y))
        except Exception:
            pass
    return str(x)

# mask/unmask angle-bracketed numbers/units to survive HTML parsing
# --- FIX: only mask "<" when it's clearly a numeric comparator, do NOT touch '>' (was breaking <sub> etc.)
_ANGLE_OPEN = "⟨"

def _mask_numeric_angles(s: str) -> str:
    # mask patterns like "< 500", "<-10", "<+3" that are NOT HTML tags
    return re.sub(r"(?<![A-Za-z])<\s*(?=[\d\+\-])", _ANGLE_OPEN, s)

def _unmask_numeric_angles(s: str) -> str:
    # restore masked comparator "< "
    return s.replace(_ANGLE_OPEN, "< ")

def strip_html_safely(s: str) -> str:
    """Remove HTML tags from *s* while protecting numeric '<' comparators.

    No-op for non-strings or strings without real tags. Prefers
    BeautifulSoup/lxml (scripts/styles/noscript dropped); falls back to a
    regex tag strip with a RuntimeWarning when those libraries are missing.
    """
    if not isinstance(s, str) or "<" not in s:
        return s
    if not SAFE_TAG_RE.search(s):
        # '<' present but no real tag (e.g. "< 500 µm") -> leave untouched
        return s
    try:
        from bs4 import BeautifulSoup  # type: ignore
        masked = _mask_numeric_angles(s)
        soup = BeautifulSoup(masked, "lxml")
        for junk in soup(["script", "style", "noscript"]):
            junk.decompose()
        return _unmask_numeric_angles(soup.get_text("\n"))
    except Exception:
        warnings.warn("BeautifulSoup/lxml unavailable, using regex-based tag strip.", RuntimeWarning)
        return SAFE_TAG_RE.sub(" ", s)

def _strip_known_prefix_artifacts(s: str) -> str:
    """Drop known boilerplate banners/prefixes from the start of *s*.

    Note: leading whitespace is stripped even when no prefix matches.
    """
    if not isinstance(s, str) or not s:
        return s
    text = s.lstrip()
    for prefix in TEXT_PREFIXES:
        if text.startswith(prefix):
            text = text[len(prefix):].lstrip()
    for pattern in REGEX_PREFIXES:
        text = re.sub(pattern, "", text, flags=re.I | re.S)
    return text

def _is_techy_line(ln: str) -> bool:
    """Heuristic: a line carrying numbers plus units/symbols is technical content
    that should be protected from junk-line removal."""
    if not ln or len(ln) < 8:
        return False
    if not any(ch.isdigit() for ch in ln):
        return False
    if any(sym in ln for sym in ["%", "°", "±"]):
        return True
    return any(re.search(rf"\b{re.escape(u)}\b", ln, flags=re.I) for u in UNIT_TOKENS)

def _dedupe_paragraphs(s: str) -> str:
    paras = [p.strip() for p in re.split(r"(?:\n\s*\n)+", s) if p.strip()]
    seen, out = set(), []
    for p in paras:
        key = re.sub(r"\s+", " ", p).strip().lower()
        if key not in seen:
            seen.add(key)
            out.append(p)
    return "\n\n".join(out)

def _trim_azo_citations(s: str) -> str:
    lines = s.splitlines()
    out, seen_cite = [], False
    for ln in lines:
        is_cite_line = "AZoCleantech" in ln or "AZoNetwork" in ln
        looks_retrieved = re.search(r"\b(Retrieved on|viewed|accessed)\b", ln, re.I)
        if is_cite_line and not looks_retrieved and not seen_cite:
            out.append(ln); seen_cite = True
        elif looks_retrieved:
            continue
        else:
            out.append(ln)
    return "\n".join(out)

def _drop_tail_junk(lines: list[str], domain: Optional[str]) -> list[str]:
    """Drop boilerplate lines from the tail (last ~40 lines) of an article.

    Generic and per-domain junk patterns are removed unless a line looks
    technical (numbers + units); the Energy-XPRT "sourcing" prompt starts a
    multi-line skip that runs until the next blank line.
    """
    n = len(lines)
    tail_start = max(0, n - 40)  # only this tail window is eligible for drops
    out = []
    skip_block = False

    # Compile this domain's tail patterns once (case-insensitive).
    domain_res = []
    if domain and domain in DOMAIN_TAIL_DROPS:
        domain_res = [re.compile(rx, flags=re.I) for rx in DOMAIN_TAIL_DROPS[domain]]

    for i, ln in enumerate(lines):
        ln_stripped = ln.strip()
        if not ln_stripped:
            # a blank line terminates any in-progress junk block; blanks are kept
            if skip_block:
                skip_block = False
            out.append(ln)
            continue

        # Domain-specific multi-line junk blocks (e.g., Energy-XPRT sourcing prompt)
        if i >= tail_start and re.search(r"^Need help finding the right suppliers\?", ln, re.I):
            skip_block = True
            continue
        if skip_block:
            continue

        # Tail-only generic/domain junk unless the line looks technical
        if i >= tail_start and (GENERIC_TAIL_RE.search(ln) or any(rx.search(ln) for rx in domain_res)):
            if not _is_techy_line(ln):
                continue

        # SGO conference blocks
        if i >= tail_start and re.search(r"^Upcoming conferences organized by SGO:", ln, re.I):
            continue

        out.append(ln)
    return out

def _minimal_pass(x: Any) -> str:
    """Conservative fallback cleaning: flatten lists, unescape HTML entities,
    strip tags and known prefixes, de-dupe paragraphs, normalize whitespace."""
    text = _maybe_list_to_str(x)
    text = html.unescape(text)
    text = strip_html_safely(text)
    text = _strip_known_prefix_artifacts(text)
    text = _dedupe_paragraphs(text)
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\s*\n\s*", "\n", text)
    return text.strip()

def build_stoplines(df: pd.DataFrame, text_col: str, domain_col: str, min_len: int = 40,
                    max_len: int = 240, min_freq: int = 10) -> dict[str, set[str]]:
    """
    Learn high-frequency lines per domain to drop later (footers/repeaters).

    Counts whitespace-normalized, lower-cased lines of length [min_len, max_len]
    per domain; lines occurring at least *min_freq* times become stoplines.
    Returns a mapping domain -> set of lines, plus a generic "*" bucket
    aggregated across all domains.

    Fix: a missing/NaN domain no longer raises AttributeError (NaN is truthy,
    so ``(dom or "").lower()`` used to call .lower() on a float) — non-string
    domains are now bucketed under "".
    """
    counts_by_domain: dict[str, collections.Counter] = collections.defaultdict(collections.Counter)
    for _, row in df[[text_col, domain_col]].dropna(subset=[text_col]).iterrows():
        dom = row.get(domain_col)
        dom = dom.lower() if isinstance(dom, str) else ""
        for ln in str(row[text_col]).splitlines():
            ln_norm = re.sub(r"\s+", " ", ln.strip()).lower()
            if min_len <= len(ln_norm) <= max_len:
                counts_by_domain[dom][ln_norm] += 1

    stop = {dom: {l for l, c in ctr.items() if c >= min_freq}
            for dom, ctr in counts_by_domain.items()}

    # also a generic bucket
    all_ctr = collections.Counter()
    for ctr in counts_by_domain.values():
        all_ctr.update(ctr)
    stop["*"] = {l for l, c in all_ctr.items() if c >= min_freq}
    return stop

def clean_text(x: Any, domain: Optional[str] = None, stoplines: Optional[dict] = None,
               mode: str = "lo") -> str:
    """Full cleaning pass for one article: flatten -> unescape -> strip tags ->
    drop learned stoplines -> trim tail junk -> de-dupe paragraphs -> normalize
    whitespace, with a fail-open fallback if too much text was removed.
    Returns "" for empty/non-text input.
    """
    raw = _maybe_list_to_str(x)
    if not isinstance(raw, str) or not raw.strip():
        return ""

    s = html.unescape(raw)
    s = strip_html_safely(s)
    s = _strip_known_prefix_artifacts(s)

    # Optional domain frequency-based drop (safe—done before paragraph de-dupe)
    lines = s.splitlines()
    if stoplines:
        dom_key = (domain or "").lower()
        dom_stop = stoplines.get(dom_key, set())
        gen_stop = stoplines.get("*", set())
        kept = []
        for ln in lines:
            ln_norm = re.sub(r"\s+", " ", ln.strip()).lower()
            # technical-looking lines are protected even when they match a stopline
            if (ln_norm in dom_stop or ln_norm in gen_stop) and not _is_techy_line(ln):
                continue
            kept.append(ln)
        lines = kept

    # Tail cleanup (light)
    lines = _drop_tail_junk(lines, domain=(domain or "").lower())
    s = "\n".join(lines)

    if mode in ("lo", "med"):
        # Gentle AZo citation thinning
        if domain and "azocleantech.com" in domain:
            s = _trim_azo_citations(s)

    # Exact paragraph de-duplication
    s = _dedupe_paragraphs(s)

    # Whitespace normalization but keep paragraphs
    # NOTE(review): this regex also collapses "\n\n" paragraph breaks into a
    # single "\n", at odds with the comment above — confirm intended.
    s = re.sub(r"[ \t]+", " ", s)
    s = re.sub(r"\s*\n\s*", "\n", s).strip()

    # Fail-open safeguard: if we removed too much, revert to minimal
    raw_len = len(str(raw))
    if raw_len > 400 and len(s) < 0.35 * raw_len:
        s = _minimal_pass(raw)

    return s

def parse_date_col(s: pd.Series, colname: str = "date", dayfirst: bool = False) -> pd.Series:
    """Parse a date column, additionally recognizing 10-digit (seconds) and
    13-digit (milliseconds) Unix epoch values.

    NOTE(review): this redefines the earlier YYYYMMDD-aware parse_date_col;
    integer YYYYMMDD values are NOT handled here — confirm which version
    later cells should be using.
    """
    try:
        numeric = pd.to_numeric(s, errors="coerce")
        epoch_mask = numeric.notna() & s.astype(str).str.fullmatch(r"\d{10,13}")
        parsed = pd.to_datetime(s, errors="coerce", utc=False, dayfirst=dayfirst)
        if epoch_mask.any():
            # 13-digit => ms
            ms_mask = numeric.astype("Int64").astype(str).str.len() == 13
            ms = epoch_mask & ms_mask
            sec = epoch_mask & ~ms_mask
            parsed.loc[ms] = pd.to_datetime(numeric[ms], unit="ms", errors="coerce")
            parsed.loc[sec] = pd.to_datetime(numeric[sec], unit="s", errors="coerce")
        return parsed
    except Exception:
        return pd.to_datetime(s, errors="coerce", utc=False, dayfirst=dayfirst)

# --- plotting helpers (overlay) ---
# FIX: use common bin edges for fair overlays
def overlay_hist(a, b, label_a, label_b, title, xlabel, bins=60, fname=None):
    """Overlay two histograms on shared bin edges (fair comparison);
    optionally save the figure to FIGS_DIR/fname."""
    series_a = pd.Series(a).dropna()
    series_b = pd.Series(b).dropna()
    combined = pd.concat([series_a, series_b], ignore_index=True)
    if combined.empty:
        print(f"[{title}] no data to plot")
        return
    # shared edges so both distributions are binned identically
    edges = np.histogram_bin_edges(combined.values, bins=bins)
    plt.figure(figsize=(8, 4))
    plt.hist(series_a, bins=edges, alpha=0.5, label=label_a)
    plt.hist(series_b, bins=edges, alpha=0.5, label=label_b)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("count")
    plt.legend()
    plt.tight_layout()
    if fname:
        plt.savefig(FIGS_DIR / fname, dpi=120)
    plt.show()

# --- “heavy removals” helper (generic) ---
def top_heavy_removals(
    df: pd.DataFrame,
    k: int = 15,
    min_raw: int = 200,
    raw_col: str = "content_rawtext",
    clean_col: str = "content_clean",
    extra_cols: list[str] | None = None,
):
    if raw_col not in df.columns or clean_col not in df.columns:
        missing = {c for c in [raw_col, clean_col] if c not in df.columns}
        raise KeyError(f"Missing required column(s): {missing}")

    tmp = df.copy()
    tmp["raw_len"]   = tmp[raw_col].fillna("").astype(str).str.len()
    tmp["clean_len"] = tmp[clean_col].fillna("").astype(str).str.len()
    tmp = tmp[tmp["raw_len"] >= min_raw].copy()
    tmp["removed_pct"] = np.where(tmp["raw_len"] > 0, 1 - (tmp["clean_len"] / tmp["raw_len"]), 0.0)

    base_cols = [raw_col, clean_col, "raw_len", "clean_len", "removed_pct"]
    extras = [c for c in (extra_cols or []) if c in tmp.columns]
    cols = list(dict.fromkeys(extras + base_cols))  # unique, preserve order

    return tmp.sort_values("removed_pct", ascending=False).head(k)[cols]

# =========================
# === Apply to MEDIA ======
# =========================
# NOTE: this cell mutates the notebook-level df_media in place; later columns
# are derived from earlier ones (url_clean -> domain, content_rawtext ->
# content_clean), so statement order matters.
df_media = df_media.copy()
if "Unnamed: 0" in df_media.columns:
    # Drop the auto-generated CSV index column from the Kaggle export.
    df_media = df_media.drop(columns=["Unnamed: 0"])

if "url" in df_media.columns:
    df_media["url_clean"] = df_media["url"].map(canonicalize_url)
    df_media["domain"]    = df_media["url_clean"].map(extract_domain)
    # Flag listing/archive pages (do NOT drop) - keep for later filtering in pipeline
    df_media["is_listing"] = df_media["url_clean"].map(is_listing)
    print(f"Listing/archive pages flagged: {int(df_media['is_listing'].sum())} / {len(df_media)}")

# Learn high-frequency stoplines from your corpus (only if content column exists)
# min_freq=10: a line must recur at least 10 times to be treated as boilerplate
# (exact semantics per build_stoplines, defined earlier in the file).
stoplines_by_domain = {}
if "content" in df_media.columns:
    stoplines_by_domain = build_stoplines(
        df_media, text_col="content", domain_col="domain",
        min_len=40, max_len=240, min_freq=10
    )

if "title" in df_media.columns:
    # Titles get only the minimal pass; raw/clean lengths feed the overlay plots.
    df_media["title_clean"] = df_media["title"].map(_minimal_pass)
    df_media["title_char_count"]       = df_media["title"].astype(str).str.len()
    df_media["title_clean_char_count"] = df_media["title_clean"].astype(str).str.len()

if "content" in df_media.columns:
    # _maybe_list_to_str presumably flattens list-like content to plain text —
    # TODO confirm against its definition earlier in the file.
    df_media["content_rawtext"] = df_media["content"].map(_maybe_list_to_str)
    df_media["content_clean"] = df_media.apply(
        lambda r: clean_text(
            r.get("content_rawtext"),
            r.get("domain"),
            stoplines=stoplines_by_domain,
            mode="lo"
        ),
        axis=1
    )
    # length features (raw + clean)
    df_media["content_raw_char_count"]  = df_media["content_rawtext"].astype(str).str.len()
    df_media["content_raw_word_count"]  = df_media["content_rawtext"].astype(str).str.split().map(len, na_action="ignore")
    df_media["content_char_count"]      = df_media["content_clean"].astype(str).str.len()
    df_media["content_word_count"]      = df_media["content_clean"].astype(str).str.split().map(len, na_action="ignore")
    # Flag (not drop) rows below the word threshold; gating happens later.
    df_media["is_thin_content"]         = df_media["content_word_count"].fillna(0).lt(THIN_WORD_THRESHOLD)

if "date" in df_media.columns:
    df_media["published_dt"] = parse_date_col(df_media["date"], "date")

print("Media shape after text pass:", df_media.shape)

# =========================
# === Apply to PATENTS ====
# =========================
# Works on a copy; the derived columns feed the diagnostics and quality gates.
df_patents = df_patents.copy()

if "title" in df_patents.columns:
    # Minimal pass only (HTML entity/tag cleanup); lengths kept for overlays.
    df_patents["title_clean"] = df_patents["title"].map(_minimal_pass)
    df_patents["title_char_count"]       = df_patents["title"].astype(str).str.len()
    df_patents["title_clean_char_count"] = df_patents["title_clean"].astype(str).str.len()

if "abstract" in df_patents.columns:
    # NOTE: minimal pass here to keep as much content as possible; subscripts like Fe2O3 are preserved (fixed).
    df_patents["abstract_clean"]          = df_patents["abstract"].map(_minimal_pass)
    df_patents["abstract_raw_char_count"] = df_patents["abstract"].astype(str).str.len()
    df_patents["abstract_raw_word_count"] = df_patents["abstract"].astype(str).str.split().map(len, na_action="ignore")
    df_patents["abstract_char_count"]     = df_patents["abstract_clean"].astype(str).str.len()
    df_patents["abstract_word_count"]     = df_patents["abstract_clean"].astype(str).str.split().map(len, na_action="ignore")

if "publication_date" in df_patents.columns:
    # Dates appear as YYYYMMDD strings in the sample output (e.g. 20230323);
    # parse_date_col coerces anything unparseable to NaT.
    df_patents["publication_date_dt"] = parse_date_col(df_patents["publication_date"], "publication_date")

if "country_code" in df_patents.columns:
    # Uppercase for consistent grouping/joins downstream.
    df_patents["country_code"] = df_patents["country_code"].astype(str).str.upper()

print("Patents shape after text pass:", df_patents.shape)
Text Only
1
2
3
Listing/archive pages flagged: 527 / 20111
Media shape after text pass: (20111, 19)
Patents shape after text pass: (406857, 17)
Python
# =========================
# === Diagnostics =========
# =========================
# Robust emptiness metrics
# .get(...) with an all-NaN fallback series means a missing column reads as
# 100% empty rather than raising — intentional for partial notebook runs.
empty_media = (df_media.get("content_clean", pd.Series(index=df_media.index)).fillna("").astype(str).str.len() == 0).mean()*100
empty_pat   = (df_patents.get("abstract_clean", pd.Series(index=df_patents.index)).fillna("").astype(str).str.len() == 0).mean()*100
print(f"Empty content after clean — media: {empty_media:.2f}%, patents: {empty_pat:.2f}%")

# --- overlay plots: MEDIA (raw vs clean) ---
# Each plot is guarded on its input columns so this cell degrades gracefully
# when an upstream cell was skipped; PNGs land in FIGS_DIR via overlay_hist.
if "content_raw_char_count" in df_media.columns and "content_char_count" in df_media.columns:
    overlay_hist(df_media["content_raw_char_count"], df_media["content_char_count"],
                 "raw content", "clean content",
                 "MEDIA: Content char length (raw vs clean)", "chars",
                 bins=60, fname="media_overlay_content_chars.png")
if "content_raw_word_count" in df_media.columns and "content_word_count" in df_media.columns:
    overlay_hist(df_media["content_raw_word_count"], df_media["content_word_count"],
                 "raw content", "clean content",
                 "MEDIA: Content word count (raw vs clean)", "words",
                 bins=60, fname="media_overlay_content_words.png")
if "title_char_count" in df_media.columns and "title_clean_char_count" in df_media.columns:
    overlay_hist(df_media["title_char_count"], df_media["title_clean_char_count"],
                 "raw title", "clean title",
                 "MEDIA: Title char length (raw vs clean)", "chars",
                 bins=60, fname="media_overlay_title_chars.png")

# --- overlay plots: PATENTS (raw vs clean) ---
if "abstract_raw_char_count" in df_patents.columns and "abstract_char_count" in df_patents.columns:
    overlay_hist(df_patents["abstract_raw_char_count"], df_patents["abstract_char_count"],
                 "raw abstract", "clean abstract",
                 "PATENTS: Abstract char length (raw vs clean)", "chars",
                 bins=60, fname="patents_overlay_abstract_chars.png")
if "abstract_raw_word_count" in df_patents.columns and "abstract_word_count" in df_patents.columns:
    overlay_hist(df_patents["abstract_raw_word_count"], df_patents["abstract_word_count"],
                 "raw abstract", "clean abstract",
                 "PATENTS: Abstract word count (raw vs clean)", "words",
                 bins=60, fname="patents_overlay_abstract_words.png")
if "title_char_count" in df_patents.columns and "title_clean_char_count" in df_patents.columns:
    overlay_hist(df_patents["title_char_count"], df_patents["title_clean_char_count"],
                 "raw title", "clean title",
                 "PATENTS: Title char length (raw vs clean)", "chars",
                 bins=60, fname="patents_overlay_title_chars.png")
Text Only
1
Empty content after clean — media: 0.00%, patents: 0.00%

png

png

png

png

png

png

Python
# --- examples: titles where char length changed (raw vs clean) ---
def title_change_examples(df: pd.DataFrame, label: str, n: int = 12) -> None:
    """
    Show rows where title length changed after cleaning.

    Displays the top-|delta| rows and a random sample as DataFrames.
    NOTE: if the char-count columns are missing they are computed and written
    back onto *df* (deliberate side effect, kept for notebook convenience).

    Parameters
    ----------
    df : frame with at least `title` and `title_clean`.
    label : tag used in the printed summary lines.
    n : maximum rows per displayed table.
    """
    # Ensure char count cols exist (compute if missing)
    if "title_char_count" not in df.columns and "title" in df.columns:
        df["title_char_count"] = df["title"].astype(str).str.len()
    if "title_clean_char_count" not in df.columns and "title_clean" in df.columns:
        df["title_clean_char_count"] = df["title_clean"].astype(str).str.len()

    required = {"title", "title_clean", "title_char_count", "title_clean_char_count"}
    missing = required - set(df.columns)
    if missing:
        print(f"[{label}] skipped: missing columns {missing}")
        return

    tmp = df.assign(delta=df["title_clean_char_count"] - df["title_char_count"])
    tmp = tmp[tmp["delta"] != 0]

    total = len(df)
    changed = len(tmp)
    # FIX: guard against ZeroDivisionError on an empty frame.
    pct = changed / total if total else 0.0
    print(f"[{label}] titles changed: {changed}/{total} ({pct:.1%})")
    if changed == 0:
        return

    cols = [c for c in ["url", "domain", "title", "title_clean",
                        "title_char_count", "title_clean_char_count", "delta"]
            if c in tmp.columns]

    # Top absolute changes
    top = tmp.reindex(tmp["delta"].abs().sort_values(ascending=False).index).head(n)
    print("\nTop by absolute change:")
    with pd.option_context("display.max_colwidth", 160):
        display(top[cols])

    # Random sample (fixed seed for reproducible review)
    rs = tmp.sample(min(n, changed), random_state=42)
    print("\nRandom sample:")
    with pd.option_context("display.max_colwidth", 160):
        display(rs[cols])

# Run for MEDIA and PATENTS
title_change_examples(df_media, "MEDIA", n=10)
title_change_examples(df_patents, "PATENTS", n=10)

# --- heavy removals tables (MEDIA) ---
# Surfaces the rows where cleaning removed the largest share of text —
# the quickest way to spot over-aggressive stopline/boilerplate removal.
heavy_media = top_heavy_removals(
    df_media, k=15, min_raw=200,
    raw_col="content_rawtext", clean_col="content_clean",
    extra_cols=["url","domain","title","is_listing"]
)

print("\nTop heavy removals — MEDIA:")
# Re-filter defensively: extra columns are only present if upstream cells ran.
media_cols = [c for c in [
    "url","domain","title","is_listing","raw_len","clean_len",
    "removed_pct","content_rawtext","content_clean"
] if c in heavy_media.columns]

with pd.option_context("display.max_colwidth", 160):
    display(heavy_media[media_cols])


# --- heavy removals tables (PATENTS) ---
# NOTE: raw_col is the original `abstract` column (patents have no separate
# rawtext column), so removed_pct measures minimal-pass shrinkage only.
heavy_patents = top_heavy_removals(
    df_patents, k=15, min_raw=200,
    raw_col="abstract", clean_col="abstract_clean",
    extra_cols=["title","country_code","publication_date"]
)

print("\nTop heavy removals — PATENTS:")
patent_cols = [c for c in [
    "title","country_code","publication_date","raw_len","clean_len",
    "removed_pct","abstract","abstract_clean"
] if c in heavy_patents.columns]

with pd.option_context("display.max_colwidth", 160):
    display(heavy_patents[patent_cols])
Text Only
1
2
3
4
[MEDIA] titles changed: 0/20111 (0.0%)
[PATENTS] titles changed: 14117/406857 (3.5%)

Top by absolute change:
title title_clean title_char_count title_clean_char_count delta
77089 &lt;u style=&#34;single&#34;&gt;NEUE ZWEIKOMPONENTEN-BESCHICHTUNGSSYSTEME ENTHALTEND POLYASPARAGINSÄUREESTER NEUE ZWEIKOMPONENTEN-BESCHICHTUNGSSYSTEME ENTHALTEND POLYASPARAGINSÄUREESTER 108 76 -32
77196 &lt;u style=&#34;single&#34;&gt;NEUE ZWEIKOMPONENTEN-BESCHICHTUNGSSYSTEME ENTHALTEND POLYASPARAGINSÄUREESTER NEUE ZWEIKOMPONENTEN-BESCHICHTUNGSSYSTEME ENTHALTEND POLYASPARAGINSÄUREESTER 108 76 -32
77423 &lt;u style=&#34;single&#34;&gt;NEUE ZWEIKOMPONENTEN-BESCHICHTUNGSSYSTEME ENTHALTEND POLYASPARAGINSÄUREESTER NEUE ZWEIKOMPONENTEN-BESCHICHTUNGSSYSTEME ENTHALTEND POLYASPARAGINSÄUREESTER 108 76 -32
77494 &lt;u style=&#34;single&#34;&gt;NEUE ZWEIKOMPONENTEN-BESCHICHTUNGSSYSTEME ENTHALTEND POLYASPARAGINSÄUREESTER NEUE ZWEIKOMPONENTEN-BESCHICHTUNGSSYSTEME ENTHALTEND POLYASPARAGINSÄUREESTER 108 76 -32
40906 &lt;u style=&#34;single&#34;&gt;NEUE ZWEIKOMPONENTEN-BESCHICHTUNGSSYSTEME ENTHALTEND POLYASPARAGINSÄUREESTER NEUE ZWEIKOMPONENTEN-BESCHICHTUNGSSYSTEME ENTHALTEND POLYASPARAGINSÄUREESTER 108 76 -32
77219 &lt;u style=&#34;single&#34;&gt;NEUE ZWEIKOMPONENTEN-BESCHICHTUNGSSYSTEME ENTHALTEND POLYASPARAGINSÄUREESTER NEUE ZWEIKOMPONENTEN-BESCHICHTUNGSSYSTEME ENTHALTEND POLYASPARAGINSÄUREESTER 108 76 -32
77502 &lt;u style=&#34;single&#34;&gt;NEUE ZWEIKOMPONENTEN-BESCHICHTUNGSSYSTEME ENTHALTEND POLYASPARAGINSÄUREESTER NEUE ZWEIKOMPONENTEN-BESCHICHTUNGSSYSTEME ENTHALTEND POLYASPARAGINSÄUREESTER 108 76 -32
40807 &lt;u style=&#34;single&#34;&gt;NEUE ZWEIKOMPONENTEN-BESCHICHTUNGSSYSTEME ENTHALTEND POLYASPARAGINSÄUREESTER NEUE ZWEIKOMPONENTEN-BESCHICHTUNGSSYSTEME ENTHALTEND POLYASPARAGINSÄUREESTER 108 76 -32
40916 &lt;u style=&#34;single&#34;&gt;NEUE ZWEIKOMPONENTEN-BESCHICHTUNGSSYSTEME ENTHALTEND POLYASPARAGINSÄUREESTER NEUE ZWEIKOMPONENTEN-BESCHICHTUNGSSYSTEME ENTHALTEND POLYASPARAGINSÄUREESTER 108 76 -32
77197 &lt;u style=&#34;single&#34;&gt;NEUE ZWEIKOMPONENTEN-BESCHICHTUNGSSYSTEME ENTHALTEND POLYASPARAGINSÄUREESTER NEUE ZWEIKOMPONENTEN-BESCHICHTUNGSSYSTEME ENTHALTEND POLYASPARAGINSÄUREESTER 108 76 -32
Text Only
1
Random sample:
title title_clean title_char_count title_clean_char_count delta
377597 Procédé de biosynthèse d&#39;amidon Procédé de biosynthèse d'amidon 35 31 -4
337706 Procédés et appareils comprenant un système de gestion d&#39;énergie Procédés et appareils comprenant un système de gestion d'énergie 68 64 -4
272967 Wind energy recycling&#39;s machine of sweeping floor Wind energy recycling's machine of sweeping floor 53 49 -4
291797 Dispositif d&#39;utilisation d&#39;énergie solaire Dispositif d'utilisation d'énergie solaire 50 42 -8
37705 Appareil de production d&#39;énergie éolienne Appareil de production d'énergie éolienne 45 41 -4
83701 Véhicule de régulation d&#39;eau photoélectrique multifonctionnel auto-alimenté Véhicule de régulation d'eau photoélectrique multifonctionnel auto-alimenté 79 75 -4
168314 Systeme d&#39;energie solaire Systeme d'energie solaire 29 25 -4
72978 Système de collecte d&#39;énergie Système de collecte d'énergie 33 29 -4
307397 Procede - notamment procede de stockage d&#39;energie - de fourniture d&#39;energie a proximite du point de consommation utilisant des sources d&#39;energie... Procede - notamment procede de stockage d'energie - de fourniture d'energie a proximite du point de consommation utilisant des sources d'energie regenerativ... 193 181 -12
64316 Générateur d&#39;esp Générateur d'esp 20 16 -4
Text Only
1
Top heavy removals — MEDIA:
url domain title is_listing raw_len clean_len removed_pct content_rawtext content_clean
3905 https://www.greenprophet.com/2022/11/what-is-digital-sustainability/ www.greenprophet.com What is digital sustainability? False 2181 1945 0.11 window.dojoRequire ( [ `` mojo/signup-forms/Loader '' ], function ( L) { L.start ( { `` baseUrl '': '' mc.us4.list-manage.com '', '' uuid '': '' 2a6df7ce0f3... Farmed Here builds high-tech hydroponic farms run by software. Digital sustainability helps make food, fuel and the home more sustainable Digital sustainabi...
1018 https://www.greenprophet.com/page/20/ www.greenprophet.com Green Prophet - Page 20 of 675 - Sustainability news for the Middle East True 2380 2144 0.10 window.dojoRequire ( [ `` mojo/signup-forms/Loader '' ], function ( L) { L.start ( { `` baseUrl '': '' mc.us4.list-manage.com '', '' uuid '': '' 2a6df7ce0f3... These mysterious waves move in the opposite direction to the sun's rotation, which is to the right, three times faster than what is allowed by hydrodynamics...
4332 https://www.greenprophet.com/2022/12/dubai-20-minute-city/ www.greenprophet.com Let's celebrate Dubai's 20-minute city False 2636 2400 0.09 window.dojoRequire ( [ `` mojo/signup-forms/Loader '' ], function ( L) { L.start ( { `` baseUrl '': '' mc.us4.list-manage.com '', '' uuid '': '' 2a6df7ce0f3... Sheikh Mohammed bin Rashid visited Bustanica, an urban hydroponics farm in Dubai. He plans that 80% of his city will be available to everyone within a 20-mi...
868 https://www.greenprophet.com/2022/03/irans-solar-panels/ www.greenprophet.com Iran's going solar with half a million solar panels False 2645 2409 0.09 window.dojoRequire ( [ `` mojo/signup-forms/Loader '' ], function ( L) { L.start ( { `` baseUrl '': '' mc.us4.list-manage.com '', '' uuid '': '' 2a6df7ce0f3... Each house will generate more than 100% of its energy needs and the rest will feed back to the household as a monthly check. Even oil rich countries underst...
5138 https://www.greenprophet.com/author/bhok/ www.greenprophet.com Bhok Thompson, Author at Green Prophet True 2854 2618 0.08 window.dojoRequire ( [ `` mojo/signup-forms/Loader '' ], function ( L) { L.start ( { `` baseUrl '': '' mc.us4.list-manage.com '', '' uuid '': '' 2a6df7ce0f3... You can do a sustainable beer tour in Amsterdam, by bike. Moving houses is stressful and exciting because you’ re happy that you are moving into a new place...
4854 https://www.greenprophet.com/2023/01/private-proxies-seo/ www.greenprophet.com SEO and private proxies: advantages False 2926 2690 0.08 window.dojoRequire ( [ `` mojo/signup-forms/Loader '' ], function ( L) { L.start ( { `` baseUrl '': '' mc.us4.list-manage.com '', '' uuid '': '' 2a6df7ce0f3... Solar energy companies, wind, hydrogen, green fashion… everyone needs SEO to be noticed in the search engines and also guard data ethically, and securely. H...
188 https://www.greenprophet.com/2022/01/wind-energy-azerbaijan/ www.greenprophet.com Offshore wind energy deal between Saudi Arabia and Azerbaijan False 3176 2940 0.07 window.dojoRequire ( [ `` mojo/signup-forms/Loader '' ], function ( L) { L.start ( { `` baseUrl '': '' mc.us4.list-manage.com '', '' uuid '': '' 2a6df7ce0f3... The project will contribute an estimated 3.7 percent of energy to Azerbaijan’ s total national grid capacity, powering 300,000 households The Azerbaijani Mi...
5019 https://www.greenprophet.com/2023/01/cdn-service-provider/ www.greenprophet.com Choosing the Ideal CDN Service Provider False 3301 3065 0.07 window.dojoRequire ( [ `` mojo/signup-forms/Loader '' ], function ( L) { L.start ( { `` baseUrl '': '' mc.us4.list-manage.com '', '' uuid '': '' 2a6df7ce0f3... Solar energy companies manage data just as much as tech startups do. How can you keep your data secure and available. As a business owner or manager, you ne...
3188 https://www.greenprophet.com/2022/09/elon-musk-starlink-iran/ www.greenprophet.com Elon Musk wants to Starlink Iranian dissidents False 3344 3108 0.07 window.dojoRequire ( [ `` mojo/signup-forms/Loader '' ], function ( L) { L.start ( { `` baseUrl '': '' mc.us4.list-manage.com '', '' uuid '': '' 2a6df7ce0f3... Ever find yourself in the position where you are caught in a rat trap? That’ s how the average Iranian feels under a conservative and brutal regime that for...
1855 https://www.greenprophet.com/2022/06/hemp-solar-house/ www.greenprophet.com Hemp solar house highlights vernacular building potential in Morocco False 3397 3161 0.07 window.dojoRequire ( [ `` mojo/signup-forms/Loader '' ], function ( L) { L.start ( { `` baseUrl '': '' mc.us4.list-manage.com '', '' uuid '': '' 2a6df7ce0f3... The Sunimplant team developed a concept for the preservation of the environment and cultural heritage in the rural region of the High Rif in north of Morocc...
4829 https://www.greenprophet.com/2023/01/manage-cattle-records-for-better-cattle-management/ www.greenprophet.com How to Manage Cattle Records For Better Cattle Management False 3616 3380 0.07 window.dojoRequire ( [ `` mojo/signup-forms/Loader '' ], function ( L) { L.start ( { `` baseUrl '': '' mc.us4.list-manage.com '', '' uuid '': '' 2a6df7ce0f3... Whatever kind of calving business you run or manage, software can run the farm more sustainably for camels or cows. Having an effective cattle management sy...
4333 https://www.greenprophet.com/2022/12/sustainable-sea-circus/ www.greenprophet.com How to join the sea circus False 3654 3418 0.06 window.dojoRequire ( [ `` mojo/signup-forms/Loader '' ], function ( L) { L.start ( { `` baseUrl '': '' mc.us4.list-manage.com '', '' uuid '': '' 2a6df7ce0f3... Hey ma, I’ ve run away and joined a sustainable sea circus. Dreams evolve with the times: 50 or 100 years ago, running away to join the circus was a far-fet...
3639 https://www.greenprophet.com/2022/10/plastic-bags-clog-storm-sewers-lebanon/ www.greenprophet.com Plastic bags clog storm sewers, killing man in Lebanon False 3912 3676 0.06 window.dojoRequire ( [ `` mojo/signup-forms/Loader '' ], function ( L) { L.start ( { `` baseUrl '': '' mc.us4.list-manage.com '', '' uuid '': '' 2a6df7ce0f3... This week was a sneak peak into what the rainy season will be like in Lebanon. In some cities floods engulfed cars up their windows, while in others the roo...
3088 https://www.greenprophet.com/2022/09/solar-panels-lebanon/ www.greenprophet.com Lebanese install solar panels to survive energy crisis False 4466 4230 0.05 window.dojoRequire ( [ `` mojo/signup-forms/Loader '' ], function ( L) { L.start ( { `` baseUrl '': '' mc.us4.list-manage.com '', '' uuid '': '' 2a6df7ce0f3... Lebanon is facing a severe energy crisis forcing people to run private generators for switching on light bulbs and refrigerators. This is following decades ...
4738 https://www.greenprophet.com/2023/01/should-i-replace-my-gas-stove/ www.greenprophet.com Should I replace my gas stove? False 4813 4577 0.05 window.dojoRequire ( [ `` mojo/signup-forms/Loader '' ], function ( L) { L.start ( { `` baseUrl '': '' mc.us4.list-manage.com '', '' uuid '': '' 2a6df7ce0f3... A new study finds 12.7% of childhood asthma in the US is caused by gas stoves This week a US Consumer Product Safety Commission rep told Bloomberg News they...
Text Only
1
Top heavy removals — PATENTS:
title country_code publication_date raw_len clean_len removed_pct abstract abstract_clean
68962 Forming ester-substituted polymers for organic photovoltaics US 20230323 515 262 0.49 A method comprised of combining \n \n \n \n \n \n \n \n \n \n \n \... A method comprised of combining\nform a solution containing a polymer\nIn this polymer R, R′, and R″ are independently selected from the group consisting of...
68734 Forming ester-substituted polymers for organic photovoltaics US 20230323 515 262 0.49 A method comprised of combining \n \n \n \n \n \n \n \n \n \n \n \... A method comprised of combining\nform a solution containing a polymer\nIn this polymer R, R′, and R″ are independently selected from the group consisting of...
68689 Forming ester-substituted polymers for organic photovoltaics US 20230323 515 262 0.49 A method comprised of combining \n \n \n \n \n \n \n \n \n \n \n \... A method comprised of combining\nform a solution containing a polymer\nIn this polymer R, R′, and R″ are independently selected from the group consisting of...
68946 Forming ester-substituted polymers for organic photovoltaics US 20230323 515 262 0.49 A method comprised of combining \n \n \n \n \n \n \n \n \n \n \n \... A method comprised of combining\nform a solution containing a polymer\nIn this polymer R, R′, and R″ are independently selected from the group consisting of...
68831 Forming ester-substituted polymers for organic photovoltaics US 20230323 515 262 0.49 A method comprised of combining \n \n \n \n \n \n \n \n \n \n \n \... A method comprised of combining\nform a solution containing a polymer\nIn this polymer R, R′, and R″ are independently selected from the group consisting of...
68905 Forming ester-substituted polymers for organic photovoltaics US 20230323 515 262 0.49 A method comprised of combining \n \n \n \n \n \n \n \n \n \n \n \... A method comprised of combining\nform a solution containing a polymer\nIn this polymer R, R′, and R″ are independently selected from the group consisting of...
68912 Forming ester-substituted polymers for organic photovoltaics US 20230323 515 262 0.49 A method comprised of combining \n \n \n \n \n \n \n \n \n \n \n \... A method comprised of combining\nform a solution containing a polymer\nIn this polymer R, R′, and R″ are independently selected from the group consisting of...
69019 Forming ester-substituted polymers for organic photovoltaics US 20230323 515 262 0.49 A method comprised of combining \n \n \n \n \n \n \n \n \n \n \n \... A method comprised of combining\nform a solution containing a polymer\nIn this polymer R, R′, and R″ are independently selected from the group consisting of...
68748 Forming ester-substituted polymers for organic photovoltaics US 20230323 515 262 0.49 A method comprised of combining \n \n \n \n \n \n \n \n \n \n \n \... A method comprised of combining\nform a solution containing a polymer\nIn this polymer R, R′, and R″ are independently selected from the group consisting of...
320897 Fused dithieno benzothiadiazole polymers for organic photovoltaics US 20230525 787 510 0.35 A composition comprising \n \n \n \n \n \n \n \n \n \n In this composition Ar1 is in... A composition comprising\nIn this composition Ar1 is independently selected from the group consisting of:\nand Ar2 is selected from\nAdditionally in this co...
320909 Fused dithieno benzothiadiazole polymers for organic photovoltaics US 20230525 787 510 0.35 A composition comprising \n \n \n \n \n \n \n \n \n \n In this composition Ar1 is in... A composition comprising\nIn this composition Ar1 is independently selected from the group consisting of:\nand Ar2 is selected from\nAdditionally in this co...
321380 Fused dithieno benzothiadiazole polymers for organic photovoltaics US 20230525 787 510 0.35 A composition comprising \n \n \n \n \n \n \n \n \n \n In this composition Ar1 is in... A composition comprising\nIn this composition Ar1 is independently selected from the group consisting of:\nand Ar2 is selected from\nAdditionally in this co...
321608 Fused dithieno benzothiadiazole polymers for organic photovoltaics US 20230525 787 510 0.35 A composition comprising \n \n \n \n \n \n \n \n \n \n In this composition Ar1 is in... A composition comprising\nIn this composition Ar1 is independently selected from the group consisting of:\nand Ar2 is selected from\nAdditionally in this co...
321247 Fused dithieno benzothiadiazole polymers for organic photovoltaics US 20230525 787 510 0.35 A composition comprising \n \n \n \n \n \n \n \n \n \n In this composition Ar1 is in... A composition comprising\nIn this composition Ar1 is independently selected from the group consisting of:\nand Ar2 is selected from\nAdditionally in this co...
320796 Fused dithieno benzothiadiazole polymers for organic photovoltaics US 20230525 787 510 0.35 A composition comprising \n \n \n \n \n \n \n \n \n \n In this composition Ar1 is in... A composition comprising\nIn this composition Ar1 is independently selected from the group consisting of:\nand Ar2 is selected from\nAdditionally in this co...

4) Domain Hygiene & Non‑Article Suppression

What it does
Flags obvious non‑articles by path/title/domain (about/privacy/archives/shop/events, etc.) so they don’t become canonical later. Keeps non_article_reason for audit and adds is_article_like.

Why
Removes navigation/legal/store pages before dedup; improves canonical selection quality.

Python
# --- 4) Domain Hygiene & Non-Article Suppression ---
# Path fragments that mark navigation/legal/store/event pages.
NON_ARTICLE_PATH_PATTERNS = [
    r"/(about|privacy|terms|contact|cookies?)/?",
    r"/(tag|category|topics?|author|authors|login|signup|subscribe)/",
    r"^/wp-json/", r"/wp-admin/",
    r"^/shop/", r"/store/", r"/events?/", r"/exhibit", r"/sponsor",
    r"/(archive|archives)/"
]
# Title prefixes/words that indicate boilerplate pages.
NON_ARTICLE_TITLE_PATTERNS = [
    r"^about\b", r"\bprivacy\b", r"\bsubscribe\b", r"^contact\b", r"\barchives?\b"
]
# Whole domains that never host articles (shops and event microsites).
NON_ARTICLE_DOMAINS = [
    "shop.pv-magazine.com", "storagesummit.solarenergyevents.com", "www.decarbxpo.com"
]

PATH_RE  = re.compile("|".join(NON_ARTICLE_PATH_PATTERNS), flags=re.I)
TITLE_RE = re.compile("|".join(NON_ARTICLE_TITLE_PATTERNS), flags=re.I)

def flag_non_article(row) -> str:
    """Return the first matching suppression reason for a media row —
    'domain_blocklist', 'path_rule', or 'title_rule' — or '' if article-like."""
    url = row.get("url_clean") or ""
    page_title = row.get("title_clean") or ""
    domain = (row.get("domain") or "").lower()
    try:
        url_path = urllib.parse.urlsplit(url).path or ""
    except Exception:
        # Unparseable URL: fall through with no path so only title rules apply.
        url_path = ""
    if domain in NON_ARTICLE_DOMAINS:
        return "domain_blocklist"
    if PATH_RE.search(url_path):
        return "path_rule"
    if TITLE_RE.search(page_title):
        return "title_rule"
    return ""

# Apply the non-article flags; fall back to empty reasons when prerequisite
# columns are missing (e.g., an earlier cell was skipped or run out of order).
need_cols = {"url_clean","domain","title_clean"} - set(df_media.columns)
if need_cols:
    print("[WARN] Missing columns for non-article flagging:", need_cols)
    df_media["non_article_reason"] = ""
else:
    df_media["non_article_reason"] = df_media.apply(flag_non_article, axis=1)

# An empty reason string means the row still looks like a real article.
df_media["is_article_like"] = df_media["non_article_reason"].eq("")

print("Non-article flags (reason → count):")
display(df_media["non_article_reason"].value_counts().to_frame("count").head(15))
Text Only
1
Non-article flags (reason → count):
count
non_article_reason
19489
path_rule 577
domain_blocklist 45

5) Quality Gates (Measured)

What it does
Drops rows with effectively empty text after Step 4's non-article flagging but before canonicalization. Tunable thresholds, plus post-gate histograms.

Why
Removes boilerplate and keeps real content. Prevents junk pages from serving as canonical.

Python
# --- 5) Quality Gates (Measured) ---
def drop_near_empty(df: pd.DataFrame, text_col: str, min_chars: int, scope_label: str) -> pd.DataFrame:
    """
    Return a copy of *df* keeping only rows whose `text_col` has at least
    *min_chars* characters (NaN counts as empty). Prints a removal summary
    tagged with *scope_label*. If the column is missing, *df* is returned
    unchanged with a skip notice.
    """
    if text_col not in df:
        print(f"[{scope_label}] skip: '{text_col}' missing"); return df
    before = len(df)
    kept = df[df[text_col].fillna("").str.len() >= min_chars].copy()
    removed = before - len(kept)
    # FIX: guard against ZeroDivisionError when df is empty.
    share = removed / before if before else 0.0
    print(f"[{scope_label}] drop_near_empty '{text_col}': removed {removed}/{before} ({share:.1%}), min_chars={min_chars}")
    return kept

def hist_series(s: pd.Series, title: str, xlabel: str, fname: Optional[str] = None):
    """Plot a single 60-bin histogram of s (NaNs dropped); save under FIGS_DIR if fname given."""
    values = s.dropna().astype(float)
    if values.empty:
        print(f"[{title}] no data")
        return
    plt.figure(figsize=(8, 3.5))
    plt.hist(values, bins=60, alpha=0.8)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("count")
    plt.tight_layout()
    if fname:
        plt.savefig(FIGS_DIR / fname, dpi=120)
    plt.show()

MEDIA_MIN_CHARS   = 200   # tune with plots (try 120–300)
PATENT_MIN_CHARS  = 40

# Gate media on article-like rows only (Step 4 flags); patents are gated as-is.
media_pre_gate = df_media[df_media["is_article_like"]]
media_q        = drop_near_empty(media_pre_gate, "content_clean",  MEDIA_MIN_CHARS,  "MEDIA")
patents_q      = drop_near_empty(df_patents,     "abstract_clean", PATENT_MIN_CHARS, "PATENTS")

print("Rows after gates → MEDIA:", len(media_q), " PATENTS:", len(patents_q))

# Post-gate length distributions, saved to FIGS_DIR for review.
if "content_char_count" in media_q:
    hist_series(media_q["content_char_count"], "MEDIA: char length after gate", "chars",
                fname="media_q_charlen.png")
if "abstract_char_count" in patents_q:
    hist_series(patents_q["abstract_char_count"], "PATENTS: char length after gate", "chars",
                fname="patents_q_charlen.png")
Text Only
1
2
3
[MEDIA] drop_near_empty 'content_clean': removed 0/19489 (0.0%), min_chars=200
[PATENTS] drop_near_empty 'abstract_clean': removed 131/406857 (0.0%), min_chars=40
Rows after gates → MEDIA: 19489  PATENTS: 406726

png

png

6) Candidate Keys & Duplicate Pressure

Description Identify and collapse duplicates using deterministic keys and lightweight fingerprints, then select a single canonical record per cluster with a transparent, reproducible policy.

  • Media: merge by (a) strict url_key, (b) exact‐content hash content_sha1 (after conservative normalization), and (c) token-set title fingerprints title_fp within the same domain, gated by length/date to avoid over-merging.
  • Patents: duplication is structural (many rows per publication_number). We keep one canonical row per publication based on content richness and mild Englishness, and retain a links table for full provenance.

Input

  • df_media with: url/url_clean, domain, title, content_clean, published_dt, is_listing, plus word/char counts.
  • df_patents with: publication_number, application_number, title, abstract_clean, publication_date_dt, plus word/char counts. This step also (re)computes, if missing:
  • Media: content_norm → content_sha1, title_fp, url_key, content_word_count.
  • Patents: abstract_norm → abstract_sha1, title_fp, publication_date_dt, abstract_word_count.

Output Diagnostics (from the previous sub-step): media_dupes_by_*.csv, patent_dupes_by_*.csv. Canonicalization artifacts (saved under reports/exploration/):

  • media_canonical.csv — one row per canonical media document.
  • media_dupe_links.csv — one row per removed duplicate: member_id → canonical_id with reason and group_key.
  • patent_canonical.csv — one row per publication_number.
  • patent_dupe_links.csv — mapping of member row → canonical row (publication_number). QA invariant enforced: (rows removed) == (rows in links table).

Cleaning Objective

  • Media: collapse exact and near-exact duplicates while preferring a high-quality, non-listing representative; keep a full audit trail so dropped items remain traceable.
  • Patents: reduce to a single, content-rich record per publication, preserving links so later normalization (e.g., CPC/inventor junctions) and GraphRAG provenance remain intact.

Why data-driven The keys/fingerprints expose where duplicate pressure is highest (e.g., identical URLs/content; repeated templates). Title-based merges are gated by measurable signals (length ratio ≥ 0.90; date span ≤ 7 days), preventing accidental merges of legitimately different stories. For patents, the per-publication collapse directly follows the observed many-rows-per-ID structure and yields exactly one representative per publication_number.

Python
# --- 6) Candidate Keys & Duplicate Pressure — refined & canonicalization ---
import unicodedata
from collections import defaultdict
from dataclasses import dataclass
from typing import Iterable

def normalize_for_hash(text: str) -> str:
    """Normalize text for stable content hashing.

    Applies NFKC folding and lower-casing, strips URLs and e-mail
    addresses, collapses runs of spaces/tabs, and tightens whitespace
    around newlines so trivially different copies hash identically.
    Non-string input yields "".
    """
    if not isinstance(text, str):
        return ""
    normalized = unicodedata.normalize("NFKC", text).lower()
    for pattern, repl in (
        (r"https?://\S+|www\.\S+", " "),   # URLs
        (r"\S+@\S+", " "),                 # e-mail addresses
        (r"[ \t]+", " "),                  # collapse horizontal whitespace
    ):
        normalized = re.sub(pattern, repl, normalized)
    return re.sub(r"\s*\n\s*", "\n", normalized).strip()

def text_sha1(s: str) -> str:
    """Hex SHA-1 digest of *s*; falsy input (None, "") hashes as ""."""
    payload = (s if s else "").encode("utf-8", "ignore")
    return hashlib.sha1(payload).hexdigest()

# English stopwords excluded from title fingerprints.
STOPWORDS = {
    "the","a","an","and","or","of","to","in","for","on","with","by","at","from",
    "is","are","was","were","as","that","this","these","those","be","it","its"
}

def title_fingerprint(title: str) -> str:
    """Order-insensitive token fingerprint of a title.

    NFKC-folds and lower-cases the title, keeps alphanumeric tokens of
    length > 1 that are not stopwords, and joins the sorted unique tokens
    with '|'. Returns "" for blank/non-string input or when nothing
    survives the filter.
    """
    if not isinstance(title, str) or not title.strip():
        return ""
    folded = unicodedata.normalize("NFKC", title).lower()
    kept = {
        tok
        for tok in re.findall(r"[a-z0-9]+", folded)
        if len(tok) > 1 and tok not in STOPWORDS
    }
    return "|".join(sorted(kept)) if kept else ""

# Query parameters considered pure tracking noise (utm_*, click ids, Mailchimp ids).
_TRACK_RE = re.compile(r"^(utm_|fbclid$|gclid$|mc_cid$|mc_eid$)", re.I)

def to_url_key(url_like: str) -> str:
    """Canonical comparison key for a URL.

    Lower-cases the host, drops a leading "www.", trims a trailing slash
    from non-root paths, removes tracking query params and sorts the rest.
    Scheme and fragment are ignored. Falls back to the stripped input on
    any parsing error; returns "" for non-string/empty input.
    """
    if not isinstance(url_like, str) or not url_like:
        return ""
    try:
        parts = urllib.parse.urlsplit(url_like)
        host = (parts.netloc or "").lower()
        host = host[4:] if host.startswith("www.") else host
        path = parts.path or "/"
        if path.endswith("/") and len(path) > 1:
            path = path[:-1]
        kept = sorted(
            (k, v)
            for k, v in urllib.parse.parse_qsl(parts.query, keep_blank_values=True)
            if _TRACK_RE.match(k) is None
        )
        query = urllib.parse.urlencode(kept)
        return f"{host}{path}?{query}" if query else f"{host}{path}"
    except Exception:
        return url_like.strip()

def duplicate_report(df: pd.DataFrame, key: str) -> pd.DataFrame:
    """Count duplicated values of *key* in *df*.

    Returns a frame with columns [key, "dupe_count"] sorted by count
    descending; an empty frame (same columns) when the key column is
    absent or no value occurs more than once.
    """
    empty = pd.DataFrame(columns=[key, "dupe_count"])
    if key not in df.columns:
        return empty
    dupes = df.loc[df.duplicated(subset=key, keep=False)]
    if dupes.empty:
        return empty
    counts = dupes.groupby(key).size().reset_index(name="dupe_count")
    return counts.sort_values("dupe_count", ascending=False)

# --- Work copies for canonicalization (use gated inputs) ---
# Operate on copies of the quality-gated frames so canonicalization never
# mutates the upstream media_q / patents_q tables.
df_media_for_canon   = media_q.copy()
df_patents_for_canon = patents_q.copy()

# --- Ensure features exist (idempotent) on the working frames ---
# Each block below only fills a column when it is missing, so this cell can
# be re-run safely after partial pipeline runs.

# MEDIA
# Content hash: normalized text -> SHA-1, the exact-duplicate signal.
if "content_clean" in df_media_for_canon.columns and "content_sha1" not in df_media_for_canon.columns:
    df_media_for_canon["content_norm"] = df_media_for_canon["content_clean"].map(normalize_for_hash)
    df_media_for_canon["content_sha1"] = df_media_for_canon["content_norm"].map(text_sha1)

# Word count feeds both dedupe guards and canonical-row ranking.
if "content_word_count" not in df_media_for_canon.columns and "content_clean" in df_media_for_canon.columns:
    df_media_for_canon["content_word_count"] = (
        df_media_for_canon["content_clean"].fillna("").astype(str).str.split().map(len, na_action="ignore")
    )

# Title fingerprint for near-duplicate grouping within a domain.
if "title" in df_media_for_canon.columns and "title_fp" not in df_media_for_canon.columns:
    df_media_for_canon["title_fp"] = df_media_for_canon["title"].map(title_fingerprint)

# Empty fingerprints become NaN so groupby/notna() skips them later.
if "title_fp" in df_media_for_canon.columns:
    df_media_for_canon["title_fp"] = df_media_for_canon["title_fp"].replace("", np.nan)

# Parsed publish date (NaT on failure), used for date-span gating.
if "published_dt" not in df_media_for_canon.columns and "date" in df_media_for_canon.columns:
    df_media_for_canon["published_dt"] = pd.to_datetime(df_media_for_canon["date"], errors="coerce")

# Canonical URL key; prefer the cleaned URL column when available.
if "url_key" not in df_media_for_canon.columns:
    src_url = "url_clean" if "url_clean" in df_media_for_canon.columns else "url"
    df_media_for_canon["url_key"] = df_media_for_canon[src_url].map(to_url_key)

# PATENTS
# Abstract hash mirrors the media content hash.
if "abstract_clean" in df_patents_for_canon.columns and "abstract_sha1" not in df_patents_for_canon.columns:
    df_patents_for_canon["abstract_norm"] = df_patents_for_canon["abstract_clean"].map(normalize_for_hash)
    df_patents_for_canon["abstract_sha1"] = df_patents_for_canon["abstract_norm"].map(text_sha1)

if "title" in df_patents_for_canon.columns and "title_fp" not in df_patents_for_canon.columns:
    df_patents_for_canon["title_fp"] = df_patents_for_canon["title"].map(title_fingerprint)

if "title_fp" in df_patents_for_canon.columns:
    df_patents_for_canon["title_fp"] = df_patents_for_canon["title_fp"].replace("", np.nan)

if "publication_date_dt" not in df_patents_for_canon.columns and "publication_date" in df_patents_for_canon.columns:
    df_patents_for_canon["publication_date_dt"] = pd.to_datetime(df_patents_for_canon["publication_date"], errors="coerce")

# Abstract word count drives canonical-row selection for patents.
if "abstract_word_count" not in df_patents_for_canon.columns and "abstract_clean" in df_patents_for_canon.columns:
    df_patents_for_canon["abstract_word_count"] = (
        df_patents_for_canon["abstract_clean"].fillna("").astype(str).str.split().map(len, na_action="ignore")
    )

# --- Canonical selection helpers ---
@dataclass
class MediaCanonPrefs:
    """Tunable gates for media canonicalization (see canonicalize_media).

    min_len_ratio: minimum min/max word-count ratio within a
        title-fingerprint group before its members may be merged.
    max_dt_diff_days: maximum publish-date span (days) within such a group.
    require_nonlisting_canon: skip title-fingerprint merges whose members
        are all listing pages, so a listing page never becomes canonical
        via that signal.
    """
    min_len_ratio: float = 0.90
    max_dt_diff_days: int = 7
    require_nonlisting_canon: bool = True

def pick_media_canonical(group: pd.DataFrame) -> str:
    """Pick the canonical member id (an index label) for a duplicate group.

    Preference order (stable mergesort over the group):
      1. non-listing pages before listing pages,
      2. more content words,
      3. earlier publish date,
      4. longer title,
      5. lexicographically smaller URL key (deterministic tie-break).

    Returns the index label of the winning row as a string.
    """
    g = group.copy()

    def _col(name: str, fallback: str = "") -> pd.Series:
        # BUG FIX: DataFrame.get(name, "") returns the plain scalar "" when
        # the column is missing, which breaks the .astype/.str chains below;
        # always fall back to an index-aligned Series instead.
        if name in g.columns:
            return g[name]
        return pd.Series(fallback, index=g.index)

    if "is_listing" not in g.columns:
        g["is_listing"] = False
    if "content_word_count" not in g.columns:
        g["content_word_count"] = _col("content_clean").astype(str).str.split().map(len, na_action="ignore")
    if "published_dt" not in g.columns:
        g["published_dt"] = pd.NaT

    # Same precedence as before: title_clean over title, url_key over
    # url_clean over url, each with a safe empty-Series fallback.
    title_col = g["title_clean"] if "title_clean" in g.columns else _col("title")
    url_col = g["url_key"] if "url_key" in g.columns else (
        g["url_clean"] if "url_clean" in g.columns else _col("url")
    )
    title_len = title_col.astype(str).str.len()
    url_key = url_col.astype(str)

    order = g.assign(
        _is_listing = g["is_listing"].fillna(False).astype(bool),
        _words      = g["content_word_count"].fillna(-1).astype(int),
        _dt         = pd.to_datetime(g["published_dt"], errors="coerce"),
        _tlen       = title_len.fillna(0).astype(int),
        _urlk       = url_key.fillna(""),
    ).sort_values(
        by=["_is_listing", "_words", "_dt", "_tlen", "_urlk"],
        ascending=[True, False, True, False, True],
        kind="mergesort"
    )
    return str(order.index[0])

def group_level_len_ratio(group: pd.DataFrame, col: str) -> float:
    """min/max ratio of the (NaN→0) integer values in group[col].

    Returns 1.0 when the group is empty or the maximum is zero, so
    degenerate groups never block a merge on their own.
    """
    values = group[col].fillna(0).astype(int).tolist()
    if not values:
        return 1.0
    hi = max(values)
    return min(values) / hi if hi > 0 else 1.0

def group_level_date_span_days(group: pd.DataFrame, col: str) -> Optional[int]:
    """Whole-day span (max - min) of the parseable dates in group[col].

    Returns None when the column is missing or holds no valid dates,
    signalling "no date evidence" to the caller's gating logic.
    """
    if col not in group.columns:
        return None
    parsed = pd.to_datetime(group[col], errors="coerce").dropna()
    if parsed.empty:
        return None
    return int((parsed.max() - parsed.min()).days)

class UnionFind:
    """Disjoint-set forest with path compression.

    Elements are created lazily on first find(); union(a, b) makes a's
    root the representative of the merged set.
    """

    def __init__(self):
        self.parent = {}

    def find(self, x):
        """Return the root representative of x, inserting x if unseen."""
        root = self.parent.setdefault(x, x)
        if root != x:
            root = self.parent[x] = self.find(root)
        return root

    def union(self, a, b):
        """Merge the sets containing a and b; a's root wins."""
        root_a, root_b = self.find(a), self.find(b)
        if root_a != root_b:
            self.parent[root_b] = root_a

def canonicalize_media(df: pd.DataFrame, prefs: MediaCanonPrefs) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Collapse duplicate media rows into one canonical row per group.

    Duplicate groups are built with union-find over three signals:
      A) identical url_key (empty keys skipped),
      B) identical content_sha1, guarded to texts with >= 200 normalized
         chars and >= 60 words so boilerplate pages are not merged,
      C) identical title_fp within one domain, gated by prefs
         (word-count ratio, publish-date span, non-listing requirement).

    Rows are identified by 'url_clean' (falling back to 'url'). Returns
    (canonical_df, links_df) and writes media_canonical.csv plus
    media_dupe_links.csv under REPORTS_DIR; links_df maps every removed
    member to its canonical id with merge reason(s) and group key(s).
    """
    if df.empty:
        return df.copy(), pd.DataFrame(columns=["member_id","canonical_id","reason","group_key"])

    id_col = "url_clean" if "url_clean" in df.columns else "url"
    assert id_col in df.columns, "df_media must have 'url_clean' or 'url'"

    uf = UnionFind()
    reasons_by_member = defaultdict(set)    # member id -> merge reasons
    groupkeys_by_member = defaultdict(set)  # member id -> triggering group keys

    def merge_group(ids: Iterable[str], reason: str, key: str):
        # Union every non-canonical member under the group's preferred row,
        # recording why/where the merge happened for the audit trail.
        ids = [i for i in dict.fromkeys(ids) if isinstance(i, str) and i]
        if len(ids) <= 1:
            return
        g_idx = df.set_index(id_col).loc[ids]
        canon_id = pick_media_canonical(g_idx)
        for m in ids:
            if m != canon_id:
                uf.union(canon_id, m)
                reasons_by_member[m].add(reason)
                groupkeys_by_member[m].add(str(key))

    # A) identical URL keys (skip empties)
    if "url_key" in df.columns:
        for key, g in df.groupby("url_key", dropna=False):
            if not key or len(g) <= 1:
                continue
            merge_group(g[id_col].tolist(), reason="url_key", key=str(key))

    # B) exact content across domains — with guards
    if {"content_sha1","content_norm","content_word_count"}.issubset(df.columns):
        # Guard against merging very short / boilerplate pages on hash alone.
        m = (df["content_norm"].str.len() >= 200) & (df["content_word_count"] >= 60)
        for key, g in df[m].groupby("content_sha1"):
            if key and len(g) > 1:
                merge_group(g[id_col].tolist(), reason="content_sha1", key=str(key))

    # C) title FP within domain, gated
    if "title_fp" in df.columns and "domain" in df.columns:
        guard_col = "content_word_count" if "content_word_count" in df.columns else None
        for (dom, fp), g in df[df["title_fp"].notna()].groupby(["domain", "title_fp"]):
            if len(g) <= 1:
                continue
            # Length-ratio gate: members must have comparable word counts.
            ok_len = True
            if guard_col:
                ok_len = group_level_len_ratio(g, guard_col) >= prefs.min_len_ratio
            # Date-span gate: only merge stories published close together.
            ok_dt = True
            span = group_level_date_span_days(g, "published_dt")
            if span is not None:
                ok_dt = span <= prefs.max_dt_diff_days
            if not (ok_len and ok_dt):
                continue
            # Skip all-listing groups so a listing page never becomes canonical here.
            if prefs.require_nonlisting_canon and "is_listing" in g.columns and g["is_listing"].all():
                continue
            merge_group(g[id_col].tolist(), reason="title_fp_in_domain_len90_dt7d", key=f"{dom}||{fp}")

    # Resolve every member to its union-find root; never-merged rows root to themselves.
    member_ids = df[id_col].astype(str)
    root_of = {m: uf.find(m) for m in member_ids}

    df_with_root = df.copy()
    df_with_root["__root"] = df_with_root[id_col].map(root_of)

    canon_rows = []
    link_rows = []

    for root, g in df_with_root.groupby("__root", dropna=False):
        # Re-pick the canonical row on the final, possibly multi-signal group.
        g2 = g.set_index(id_col)
        final_canon_id = pick_media_canonical(g2) if len(g2) > 0 else str(root)

        gg = g.copy()
        if "is_listing" not in gg.columns: gg["is_listing"] = False
        if "content_word_count" not in gg.columns:
            gg["content_word_count"] = gg.get("content_clean","").astype(str).str.split().map(len, na_action="ignore")
        if "published_dt" not in gg.columns: gg["published_dt"] = pd.NaT
        title_len = gg.get("title_clean", gg.get("title","")).astype(str).str.len()
        url_key  = gg.get("url_key", gg.get("url_clean", gg.get("url",""))).astype(str)

        # Same preference order as pick_media_canonical, with the chosen
        # canonical id forced to rank first so duplicate ids sort behind it.
        ranked = gg.assign(
            __is_final = (gg[id_col] == final_canon_id),
            _is_listing = gg["is_listing"].fillna(False).astype(bool),
            _words      = gg["content_word_count"].fillna(-1).astype(int),
            _dt         = pd.to_datetime(gg["published_dt"], errors="coerce"),
            _tlen       = title_len.fillna(0).astype(int),
            _urlk       = url_key.fillna("")
        ).sort_values(
            by=["__is_final","_is_listing","_words","_dt","_tlen","_urlk"],
            ascending=[False, True, False, True, False, True],
            kind="mergesort"
        )

        canon_row = ranked.iloc[0]
        canon_rows.append(canon_row)

        # Every non-first row becomes a link record (the audit trail).
        for _, r in ranked.iloc[1:].iterrows():
            member_id = str(r[id_col])
            reasons = set(reasons_by_member.get(member_id, set() ))
            if member_id == final_canon_id:
                # A second physical row carrying the canonical id itself.
                reasons.add("same_id_duplicate")
            if not reasons:
                reasons.add("union")
            link_rows.append({
                "member_id": member_id,
                "canonical_id": final_canon_id,
                "reason": "|".join(sorted(reasons)),
                "group_key": "|".join(sorted(groupkeys_by_member.get(member_id, set())))
            })

    canon_df = pd.DataFrame(canon_rows).reset_index(drop=True)
    links_df = pd.DataFrame(link_rows, columns=["member_id","canonical_id","reason","group_key"])

    canon_out = REPORTS_DIR / "media_canonical.csv"
    links_out = REPORTS_DIR / "media_dupe_links.csv"
    canon_df.to_csv(canon_out, index=False)
    links_df.to_csv(links_out, index=False)
    # QA invariant: every removed row must be accounted for by a link record.
    removed = len(df) - len(canon_df)
    if removed != len(links_df):
        print(f"[WARN] removed={removed} but links={len(links_df)}; investigate groups with identical IDs.")
    else:
        print(f"[OK] Invariant holds: removed={removed} == links={len(links_df)}")

    print(f"[MEDIA] Canonical rows: {len(canon_df)}  (from {len(df)})  → {canon_out.name}")
    # NOTE(review): the separator between the count and the filename looks lost
    # in export (the logged output shows "… → media_dupe_links.csv") — confirm.
    print(f"[MEDIA] Dupe links   : {len(links_df)}{links_out.name}")

    return canon_df, links_df

# Common English tokens used as a cheap language signal for abstracts.
EN_TOKENS = {"the","and","for","with","from","this","that","method","system","device","composition","process","use"}

def englishness_score(s: str) -> float:
    """Crude English-likeness score for a text.

    Score = ASCII character ratio + 0.05 per EN_TOKENS hit among the
    lower-cased alphabetic tokens. Returns 0.0 for empty or non-string
    input. Unbounded above; only used for relative ranking.
    """
    if not isinstance(s, str) or not s:
        return 0.0
    ascii_count = sum(ord(ch) < 128 for ch in s)
    hits = sum(1 for tok in re.findall(r"[a-z]+", s.lower()) if tok in EN_TOKENS)
    return ascii_count / max(1, len(s)) + 0.05 * hits

def pick_patent_canonical(group: pd.DataFrame) -> int:
    """Pick the canonical row index for one publication_number group.

    Preference order (stable mergesort): longer abstract (words), higher
    englishness score, earlier publication date, longer title. Returns
    the winning row's index as int.
    """
    g = group.copy()
    words = g.get("abstract_word_count", pd.Series([-1]*len(g), index=g.index)).fillna(-1)
    # BUG FIX: DataFrame.get(col, "") returns the plain scalar "" when the
    # column is missing, which has no .map/.astype; fall back to an
    # index-aligned Series instead.
    abstracts = g["abstract_clean"] if "abstract_clean" in g.columns else pd.Series("", index=g.index)
    en = abstracts.map(englishness_score)
    dt = pd.to_datetime(g.get("publication_date_dt"), errors="coerce")
    titles = g["title_clean"] if "title_clean" in g.columns else (
        g["title"] if "title" in g.columns else pd.Series("", index=g.index)
    )
    tlen = titles.astype(str).str.len()
    order = g.assign(_w=words.astype(int),
                     _e=en.astype(float),
                     _dt=dt,
                     _t=tlen.astype(int)).sort_values(
        by=["_w","_e","_dt","_t"],
        ascending=[False, False, True, False],
        kind="mergesort"
    )
    return int(order.index[0])

def canonicalize_patents(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Keep exactly one row per publication_number.

    Groups strictly by publication_number (no cross-publication merging);
    multi-row groups keep the row chosen by pick_patent_canonical, and
    every other row becomes a member→canonical link for provenance.
    Returns (canonical_df, links_df) and writes patent_canonical.csv /
    patent_dupe_links.csv under REPORTS_DIR. Returns the frame unchanged
    (with empty links) when it is empty or lacks publication_number.
    """
    if df.empty or "publication_number" not in df.columns:
        return df.copy(), pd.DataFrame(columns=["member_index","publication_number","canonical_index","reason"])

    links = []
    canon_rows = []

    for pub, g in df.groupby("publication_number", dropna=False):
        if len(g) == 1:
            # Singleton publications pass through untouched.
            canon_rows.append(g.iloc[0])
            continue

        pick_idx = pick_patent_canonical(g)
        pick_row = g.loc[pick_idx]
        canon_rows.append(pick_row)

        # Record a provenance link for every dropped member row.
        for member_idx in g.index:
            if member_idx != pick_idx:
                links.append({
                    "member_index": int(member_idx),
                    "publication_number": pub,
                    "canonical_index": int(pick_idx),
                    "reason": "publication_number"
                })

    canon_df = pd.DataFrame(canon_rows).reset_index(drop=True)
    links_df = pd.DataFrame(links)

    canon_out = REPORTS_DIR / "patent_canonical.csv"
    links_out = REPORTS_DIR / "patent_dupe_links.csv"
    canon_df.to_csv(canon_out, index=False)
    links_df.to_csv(links_out, index=False)
    print(f"[PATENTS] Canonical rows: {len(canon_df)}  (from {len(df)})  → {canon_out.name}")
    # NOTE(review): separator between count and filename appears lost in export
    # (logged output shows "… → patent_dupe_links.csv") — confirm.
    print(f"[PATENTS] Dupe links   : {len(links_df)}{links_out.name}")

    return canon_df, links_df

# ---------- Run canonicalization on gated frames ----------
# Thresholds mirror the gates documented above: merge title-fp groups only
# when word counts are within 90% of each other and publish dates within 7 days.
media_prefs = MediaCanonPrefs(min_len_ratio=0.90, max_dt_diff_days=7, require_nonlisting_canon=True)
media_canonical, media_links = canonicalize_media(df_media_for_canon, media_prefs)
patent_canonical, patent_links = canonicalize_patents(df_patents_for_canon)

def print_media_stats(df_raw: pd.DataFrame, df_can: pd.DataFrame, links: pd.DataFrame):
    """Print how many media rows were merged away and the merge-reason mix.

    Guards the percentage against an empty input frame so the report
    cannot raise ZeroDivisionError.
    """
    removed = len(df_raw) - len(df_can)
    pct = removed / len(df_raw) if len(df_raw) else 0.0
    print(f"[MEDIA] Removed/merged duplicates: {removed} ({pct:.1%})")
    if not links.empty:
        print(links["reason"].value_counts().rename("merge_reason_counts"))

def print_patent_stats(df_raw: pd.DataFrame, df_can: pd.DataFrame, links: pd.DataFrame):
    """Print patent row reduction, a uniqueness sanity check, and reasons.

    Guards the percentage against an empty input frame so the report
    cannot raise ZeroDivisionError.
    """
    removed = len(df_raw) - len(df_can)
    pct = removed / len(df_raw) if len(df_raw) else 0.0
    print(f"[PATENTS] Rows reduced by: {removed} ({pct:.1%})")
    if "publication_number" in df_raw.columns:
        # Canonicalization should yield exactly one row per publication.
        pubs = df_raw["publication_number"].nunique()
        print(f"[PATENTS] Unique publications: {pubs}  → canon rows should equal this (check: {len(df_can)})")
    if not links.empty:
        print(links["reason"].value_counts().rename("merge_reason_counts"))

# Summarize the effect of canonicalization on both corpora.
print_media_stats(df_media_for_canon, media_canonical, media_links)
print_patent_stats(df_patents_for_canon, patent_canonical, patent_links)
Text Only
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
[OK] Invariant holds: removed=158 == links=158
[MEDIA] Canonical rows: 19331  (from 19489)  → media_canonical.csv
[MEDIA] Dupe links   : 158                 → media_dupe_links.csv
[PATENTS] Canonical rows: 31364  (from 406726)  → patent_canonical.csv
[PATENTS] Dupe links   : 375362                 → patent_dupe_links.csv
[MEDIA] Removed/merged duplicates: 158 (0.8%)
reason
title_fp_in_domain_len90_dt7d                 101
content_sha1                                   32
same_id_duplicate                              17
content_sha1|title_fp_in_domain_len90_dt7d      8
Name: merge_reason_counts, dtype: int64
[PATENTS] Rows reduced by: 375362 (92.3%)
[PATENTS] Unique publications: 31364  → canon rows should equal this (check: 31364)
reason
publication_number    375362
Name: merge_reason_counts, dtype: int64

Data Notes

  • Media

  • url_key normalizes host/path/query (drops tracking params, ignores scheme/www, trims trailing slashes) to catch trivial duplicates.

  • content_sha1 collapses syndicated/mirrored articles (exact text match), with guards to avoid merging boilerplate/very short pages.
  • title_fp merges are domain-scoped and gated (length ratio & date window) to target archive/profile/template pages that differ only superficially. Canonical selection always prefers non-listing over listing pages when both exist.
  • The links table provides a transparent audit trail with reason ∈ {url_key, content_sha1, title_fp_in_domain_len90_dt7d, same_id_duplicate}.

  • Patents

  • Canonicalization is strictly within publication_number; we do not merge across publications even if title_fp/abstract_sha1 collide (those collisions often reflect translations or family filings).

  • The canonical row favors longer abstracts and mild Englishness, then earliest publication date and longer title to produce a single, high-signal representative.
  • All member→canonical mappings are preserved for downstream normalization (e.g., emitting CPC and inventor junction tables) and GraphRAG provenance.

7) Patent normalization (aggregate CPC & inventors)

What it does
Aggregates all CPC codes and inventors to one row per publication_number.
Prefers longest title/abstract; keeps earliest publication date.

Why
Downstream tasks (search/stats/RAG) usually want one record per publication with multi‑value fields aggregated.

Python
# --- 7) Patent normalization (aggregate CPC & inventors) ---
# 1) Robust date parser: 8-digit YYYYMMDD, 10/13-digit epochs, strings OK
def parse_date_col(s: pd.Series, name: str = "date") -> pd.Series:
    """Parse a heterogeneous date column into datetimes (NaT on failure).

    Beyond pandas' generic inference, three numeric shapes get explicit
    handling so they are not misread: 8-digit YYYYMMDD strings, 10-digit
    epoch seconds, and 13-digit epoch milliseconds. `name` is accepted
    for call-site readability only.
    """
    text = s.astype(str).str.strip()
    parsed = pd.to_datetime(text, errors="coerce")  # generic first pass

    # (mask, parser) pairs for the special-case numeric formats; each
    # overwrites the generic result only on its matching rows.
    special = [
        (text.str.fullmatch(r"\d{8}"),  lambda v: pd.to_datetime(v, format="%Y%m%d", errors="coerce")),
        (text.str.fullmatch(r"\d{10}"), lambda v: pd.to_datetime(v.astype("int64"), unit="s", errors="coerce")),
        (text.str.fullmatch(r"\d{13}"), lambda v: pd.to_datetime(v.astype("int64"), unit="ms", errors="coerce")),
    ]
    for mask, parse in special:
        if mask.any():
            parsed.loc[mask] = parse(text[mask])

    return parsed

# 2) Reparse dates on base + gated frames
# Re-run the robust parser so earlier, format-naive parses are overwritten
# on both the base table and the quality-gated copy.
if "publication_date" in df_patents.columns:
    df_patents["publication_date_dt"] = parse_date_col(df_patents["publication_date"], "publication_date")

if "patents_q" in globals():
    # Copy before mutating so the earlier cell's frame is left intact.
    patents_q = patents_q.copy()
    if "publication_date" in patents_q.columns:
        patents_q["publication_date_dt"] = parse_date_col(patents_q["publication_date"], "publication_date")
else:
    # if you didn't create a gated set, fall back to the base table
    patents_q = df_patents.copy()

# 3) (Re)aggregate normalized patents (only if not already defined)
def _ensure_list(x):
    if isinstance(x, list): return x
    if isinstance(x, tuple): return list(x)
    if isinstance(x, str) and x.strip().startswith("[") and x.strip().endswith("]"):
        try:
            y = ast.literal_eval(x)
            if isinstance(y, (list, tuple)): return list(map(str, y))
        except Exception:
            pass
    if pd.isna(x): return []
    return [str(x)]

def pick_longest_text(series: pd.Series) -> str:
    """Return the longest string in the series (NaN treated as "").

    Returns "" for an empty series; ties go to the first occurrence.
    """
    if not len(series):
        return ""
    texts = series.fillna("").astype(str)
    return texts.iloc[texts.str.len().argmax()]

def uniq_sorted_str(vals: list[str]) -> str:
    """Join the unique, truthy values in ascending order with '; '."""
    unique = {v for v in vals if v}
    return "; ".join(sorted(unique))

def aggregate_patents(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse patents to one row per publication_number.

    Joins application numbers and country codes, keeps the longest
    title/abstract, keeps the earliest publication date (rows are sorted
    by publication_date_dt before taking "first"), and unions CPC codes
    and inventors into sorted '; '-joined strings.

    Raises ValueError when publication_number is missing. Assumes the
    frame already carries title_clean / abstract_clean /
    publication_date_dt from earlier steps — TODO confirm for new callers.
    """
    if "publication_number" not in df.columns:
        raise ValueError("publication_number required for patent normalization")
    d = df.copy()
    # BUG FIX: the previous fallback assigned the scalar list [[]], which only
    # broadcasts when len(d) == 1 and otherwise raises a length-mismatch
    # error; build one empty list per row instead.
    d["inv_list"] = d["inventor"].apply(_ensure_list) if "inventor" in d else [[] for _ in range(len(d))]
    d["cpc_list"] = d["cpc_code"].apply(_ensure_list) if "cpc_code" in d else [[] for _ in range(len(d))]

    agg = (d.sort_values(["publication_date_dt"], ascending=True)
             .groupby("publication_number", as_index=False, dropna=False)
             .agg({
                 "application_number": lambda s: "; ".join(sorted(set(map(str, s)))),
                 "country_code":       lambda s: "; ".join(sorted(set(map(str, s)))),
                 "title_clean":        pick_longest_text,
                 "abstract_clean":     pick_longest_text,
                 "publication_date_dt":"first",   # earliest, thanks to the sort above
                 "inv_list":           lambda s: sorted(set(itertools.chain.from_iterable(s))),
                 "cpc_list":           lambda s: sorted(set(itertools.chain.from_iterable(s))),
             }))

    agg["inventors"] = agg["inv_list"].apply(uniq_sorted_str)
    agg["cpc_codes"] = agg["cpc_list"].apply(uniq_sorted_str)
    return agg.drop(columns=["inv_list","cpc_list"])

# Aggregate the gated patent frame; expect one row per publication_number.
patents_norm = aggregate_patents(patents_q)
print("Patents normalized →", patents_norm.shape)
Text Only
1
Patents normalized → (31364, 8)

8) Date Coverage Plots (post‑clean)

What it does
Monthly counts for media (canonical) and patents (normalized) to spot gaps/spikes.

Why
Visual checks catch ingestion or cleaning issues.

Python
# MEDIA timeline
# Parse-success rate plus min/max give a quick sanity check before plotting;
# the bar chart surfaces ingestion gaps or spikes by month.
if "published_dt" in media_canonical:
    ok = media_canonical["published_dt"].notna().mean()
    print(f"MEDIA: published_dt parse success = {ok:.1%} "
          f"(min={media_canonical['published_dt'].min()}, max={media_canonical['published_dt'].max()})")
    media_month = media_canonical["published_dt"].dt.to_period("M").value_counts().sort_index()
    plt.figure(figsize=(10,3.5)); plt.bar(media_month.index.astype(str), media_month.values)
    plt.title("MEDIA: articles per month (clean+canonical)"); plt.xticks(rotation=45, ha="right"); plt.tight_layout()
    plt.savefig(FIGS_DIR / "media_month_counts.png", dpi=120); plt.show()

# PATENTS timeline (normalized)
# Same diagnostic for the normalized patent table.
if "publication_date_dt" in patents_norm:
    ok = patents_norm["publication_date_dt"].notna().mean()
    print(f"PATENTS: publication_date_dt parse success = {ok:.1%} "
          f"(min={patents_norm['publication_date_dt'].min()}, max={patents_norm['publication_date_dt'].max()})")
    pat_month = patents_norm["publication_date_dt"].dt.to_period("M").value_counts().sort_index()
    plt.figure(figsize=(10,3.5)); plt.bar(pat_month.index.astype(str), pat_month.values)
    plt.title("PATENTS: publications per month (normalized)"); plt.xticks(rotation=45, ha="right"); plt.tight_layout()
    plt.savefig(FIGS_DIR / "patents_month_counts.png", dpi=120); plt.show()
Text Only
1
MEDIA: published_dt parse success = 100.0% (min=2022-01-02 00:00:00, max=2024-10-24 00:00:00)

png

Text Only
1
PATENTS: publication_date_dt parse success = 100.0% (min=2022-01-01 00:00:00, max=2024-09-05 00:00:00)

png

9) Top Domains (pre/post) & Length Diagnostics

What it does
Shows where content comes from and how cleaning changes it (pre vs post).

Why
Highlights sources that need special handling or exclusion rules.

Python
# Pre-clean top domains (use raw)
# Pre/post domain counts reveal which sources lost the most rows to cleaning
# and canonicalization; both slices are persisted for CI diffing.
if "url" in df_media_raw.columns:
    pre_domains = df_media_raw["url"].map(extract_domain).value_counts().head(20)
    pre_domains.to_csv(REPORTS_DIR / "media_top_domains_pre.csv")
    print("MEDIA: top domains (raw)"); display(pre_domains.to_frame("count"))

# Post-clean top domains (canonical)
if "domain" in media_canonical.columns:
    post_domains = media_canonical["domain"].value_counts().head(20)
    post_domains.to_csv(REPORTS_DIR / "media_top_domains_post.csv")
    print("MEDIA: top domains (after cleaning+canonical)"); display(post_domains.to_frame("count"))

# Length summary (sanity)
# Character counts pre/post cleaning should shrink modestly, not collapse.
if {"content","content_clean"}.issubset(df_media.columns):
    comp = pd.DataFrame({
        "raw_chars":   df_media["content"].astype(str).str.len(),
        "clean_chars": df_media["content_clean"].astype(str).str.len()
    })
    print("MEDIA length summary (raw vs clean):"); display(comp.describe().T)
Text Only
1
MEDIA: top domains (raw)
count
url
www.energy-xprt.com 4181
www.pv-magazine.com 3070
www.azocleantech.com 2488
cleantechnica.com 2088
www.pv-tech.org 1969
www.thinkgeoenergy.com 1052
www.solarpowerportal.co.uk 850
www.solarpowerworldonline.com 778
www.energyvoice.com 664
solarindustrymag.com 620
solarquarter.com 590
www.rechargenews.com 573
www.naturalgasintel.com 297
www.iea.org 173
www.energyintel.com 170
sgvoice.energyvoice.com 164
www.greenprophet.com 130
www.greenairnews.com 58
ecofriend.com 55
forum.all-energy.co.uk 35
Text Only
1
MEDIA: top domains (after cleaning+canonical)
count
domain
www.energy-xprt.com 4049
www.pv-magazine.com 2944
www.azocleantech.com 2457
www.pv-tech.org 1966
cleantechnica.com 1752
www.thinkgeoenergy.com 1052
www.solarpowerportal.co.uk 850
www.solarpowerworldonline.com 778
www.energyvoice.com 635
solarindustrymag.com 620
solarquarter.com 590
www.rechargenews.com 557
www.naturalgasintel.com 295
www.energyintel.com 170
www.iea.org 158
sgvoice.energyvoice.com 151
www.greenprophet.com 118
www.greenairnews.com 58
ecofriend.com 53
forum.all-energy.co.uk 35
Text Only
1
MEDIA length summary (raw vs clean):
count mean std min 25% 50% 75% max
raw_chars 20,111.00 5,038.63 3,030.23 1,183.00 2,643.00 4,266.00 6,954.50 20,021.00
clean_chars 20,111.00 4,972.82 3,001.41 1,164.00 2,604.00 4,201.00 6,867.50 19,816.00

10) Language Detection

Python
# Best-effort language-detector selection; DETECTOR records which backend won.
DETECTOR = None

# Preferred: CLD3 (best for short text; reliability flag).
# NOTE(review): the PyPI package "pycld3" installs its module as `cld3`, so
# `import pycld3` always failed and the pipeline silently fell back to langid
# (visible in the logged output). Try `cld3` first and keep `pycld3` as a
# fallback alias — confirm against the installed environment.
try:
    try:
        import cld3 as _cld3
    except ImportError:
        import pycld3 as _cld3
    def _detect_lang(text: str):
        t = (text or "").strip()
        if len(t) < 25:
            return ("unknown", 0.0)     # too short to trust
        r = _cld3.get_language(t)
        if r:
            return (r.language or "unknown", float(getattr(r, "probability", 0.0)))
        return ("unknown", 0.0)
    DETECTOR = "pycld3"
except Exception:
    # Fallback: langid (pure Python)
    try:
        import langid
        langid.set_languages(None)  # keep all languages
        def _detect_lang(text: str):
            t = (text or "").strip()
            if len(t) < 25:
                return ("unknown", 0.0)
            code, score = langid.classify(t)
            return (code, float(score))
        DETECTOR = "langid"
    except Exception:
        # Fallback: langdetect (seed for determinism)
        try:
            from langdetect import detect, DetectorFactory
            DetectorFactory.seed = 0
            def _detect_lang(text: str):
                t = (text or "").strip()
                if len(t) < 25:
                    return ("unknown", 0.0)
                try:
                    return (detect(t), 1.0)  # no score available; treat as 1.0
                except Exception:
                    return ("unknown", 0.0)
            DETECTOR = "langdetect"
        except Exception:
            # No detector available; add_lang() becomes a no-op.
            DETECTOR = None

def add_lang(df: pd.DataFrame, text_col: str, code_col: str, score_col: str) -> pd.DataFrame:
    """Detect the language of df[text_col] in place.

    Adds a language-code column and a score column, prints the top-10
    code counts, and returns the same frame for chaining. No-op (with a
    console note) when no detector is available or the text column is
    missing.
    """
    if DETECTOR is None:
        print("Language detection skipped: no detector available.")
        return df
    if text_col not in df.columns:
        print(f"Language detection skipped: '{text_col}' missing.")
        return df
    detected = df[text_col].fillna("").astype(str).map(_detect_lang)
    df[code_col]  = detected.map(lambda pair: pair[0])
    df[score_col] = detected.map(lambda pair: pair[1])
    vc = df[code_col].value_counts().head(10)
    print(f"Language via {DETECTOR} on '{text_col}' → top codes:"); display(vc.to_frame("count"))
    return df
Python
1
2
3
4
5
# Apply to final tables
# Annotate both text columns of each table; add_lang mutates in place and
# returns the frame, so the reassignment is for readability only.
media_canonical = add_lang(media_canonical, "content_clean", "lang_content", "lang_content_score")
media_canonical = add_lang(media_canonical, "title_clean", "lang_title", "lang_title_score")
patents_norm    = add_lang(patents_norm,    "abstract_clean", "lang_abs", "lang_abs_score")
patents_norm    = add_lang(patents_norm,    "title_clean",    "lang_title", "lang_title_score")
Text Only
1
Language via langid on 'content_clean' → top codes:
count
lang_content
en 19325
la 2
de 2
ru 1
es 1
Text Only
1
Language via langid on 'title_clean' → top codes:
count
lang_title
en 17029
it 803
hu 549
de 171
es 116
nl 103
fr 101
mt 66
unknown 51
da 36
Text Only
1
Language via langid on 'abstract_clean' → top codes:
count
lang_abs
en 30909
fr 273
ko 101
zh 31
de 24
es 9
pl 6
ar 4
ro 2
la 1
Text Only
1
Language via langid on 'title_clean' → top codes:
count
lang_title
en 26522
fr 1111
unknown 1009
nl 670
de 521
it 434
da 301
es 187
id 107
tl 65

11) Save Silver Drafts + Audit Slices

What it does
Writes Parquet (or CSV.GZ fallback) drafts and small head samples for quick QA.

Why
Provides stable handoffs to downstream consumers and preserves provenance.

Python
## For RAG/analytics: use canonical media + normalized patents
# save_kaggle persists to the dated Silver bucket (Parquet, CSV.GZ fallback).
save_kaggle(media_canonical,  f"media_canonical_{len(media_canonical)}")
save_kaggle(patents_norm,    f"patents_normalized_{len(patents_norm)}")

# Audit CSVs
# NOTE(review): the audit-slice exports below appear deliberately disabled;
# re-enable once the referenced columns (e.g. content_char_count,
# abstract_char_count) are guaranteed present — confirm.
#(media_canonical[["url_clean","domain","published_dt","title_clean","content_char_count"]]
# .head(50).to_csv(REPORTS_DIR / "media_canonical_head50.csv", index=False))
#(patents_norm[["publication_number","publication_date_dt","title_clean","abstract_char_count","country_code","cpc_codes"]]
# .head(50).to_csv(REPORTS_DIR / "patents_norm_head50.csv", index=False))

#print("Audit CSVs written to", REPORTS_DIR)
Text Only
1
2
3
4
5
6
7
8
Kaggle Silver → C:\Users\gerbe\PycharmProjects\MT\notebooks\cleantech_data\silver\kaggle\2025-08-09\media_canonical_19331.parquet
Kaggle Silver → C:\Users\gerbe\PycharmProjects\MT\notebooks\cleantech_data\silver\kaggle\2025-08-09\patents_normalized_31364.parquet





WindowsPath('C:/Users/gerbe/PycharmProjects/MT/notebooks/cleantech_data/silver/kaggle/2025-08-09/patents_normalized_31364.parquet')

OpenAlex

Visual Overview


Visual Overview


Cleaning pipeline for OpenAlex Topics, matching the tone and structure of the media & patents sections. It:

  • Reads Bronze JSONL (stream-safe).
  • Builds canonical Silver tables: topics_canonical, topic_keywords_m2m, topic_siblings_m2m, and a tiny taxonomy reference (domains, fields, subfields).
  • Adds language detection (robust fallbacks).
  • Computes length/coverage QA slices and EDA.
  • Saves Parquet (with CSV.GZ fallback) + optional audit CSVs.
  • Leaves clear seams for later LLM enrichment and for your triple retrieval stack (dense/bi-encoder, BM25, GraphRAG).

Replace paths as needed if your date bucket differs. I’ve set the input to your provided path: cleantech_data/bronze/openalex/topics/2025-08-09/extracted/topics.jsonl


0) Notebook Preamble

Description Sets up the notebook runtime (imports, display prefs) so downstream cells behave consistently and prints tables cleanly.

Python
import os
import re
import json
import uuid
import gzip
import math
import hashlib
import datetime as dt
from pathlib import Path
from typing import Dict, Any, List, Iterable, Optional, Tuple

import pandas as pd
import numpy as np

pd.options.display.max_rows = 20
pd.options.display.max_colwidth = 120

from IPython.display import display

1) Config & Paths

Description Resolves Bronze and Silver directories exactly like media/patent: picks the latest OpenAlex topics date bucket unless an explicit path is present; derives the matching Silver bucket.

Input PROJECT_ROOT, DATA_ROOT, latest_date_dir(...), optional explicit file: .../topics/2025-08-09/extracted/topics.jsonl.

Output topics_path, bucket_name, SILVER_OPENALEX_DIR (created), console confirmation of paths.

Cleaning Objective Deterministic routing: every Silver artifact ties back to a single Bronze snapshot.

Python
# Uses same routing helpers you already defined earlier in the notebook:
# - PROJECT_ROOT, DATA_ROOT, REPO_ROOT, REPORTS_DIR
# - latest_date_dir(...), load_latest_extracted(...), N_ROWS
# NOTE(review): this cell also relies on NB_DIR (not listed above) for the
# Silver output root — confirm it is defined earlier in the notebook.

# Base for OpenAlex topics Bronze
# The OPENALEX_TOPICS_DIR env var overrides the project-relative default.
BRONZE_OPENALEX_TOPICS = Path(
    os.getenv("OPENALEX_TOPICS_DIR", PROJECT_ROOT / "cleantech_data" / "bronze" / "openalex" / "topics")
).resolve()

# Default to latest date bucket (same as media/patent sections)
openalex_date_dir = latest_date_dir(BRONZE_OPENALEX_TOPICS)
openalex_ext_dir  = openalex_date_dir / "extracted"

# Optional explicit override (your provided file). If it exists, we use it.
OPENALEX_EXPLICIT = PROJECT_ROOT / "cleantech_data" / "bronze" / "openalex" / "topics" / "2025-08-09" / "extracted" / "topics.jsonl"

if OPENALEX_EXPLICIT.exists():
    topics_path   = OPENALEX_EXPLICIT
    bucket_name   = OPENALEX_EXPLICIT.parents[1].name  # .../topics/<YYYY-MM-DD>/extracted/topics.jsonl
else:
    # Fall back to latest bucket (support both plain and gz)
    # NOTE(review): assumes at least one topics.jsonl* exists in the latest
    # bucket; an empty glob would raise IndexError here.
    topics_path   = sorted(openalex_ext_dir.glob("topics.jsonl*"))[-1]
    bucket_name   = openalex_date_dir.name

# Silver output mirrors media/patent: silver/openalex/<bronze_date_bucket>/
SILVER_OPENALEX_DIR = (NB_DIR  / "cleantech_data" / "silver" / "openalex" / bucket_name).resolve()
SILVER_OPENALEX_DIR.mkdir(parents=True, exist_ok=True)

print("OpenAlex Bronze file:", topics_path)
print("OpenAlex Bronze bucket:", bucket_name)
print("Silver (OpenAlex) dir:", SILVER_OPENALEX_DIR)
print("Reports dir         :", REPORTS_DIR)
Text Only
1
2
3
4
OpenAlex Bronze file: C:\Users\gerbe\PycharmProjects\MT\cleantech_data\bronze\openalex\topics\2025-08-09\extracted\topics.jsonl
OpenAlex Bronze bucket: 2025-08-09
Silver (OpenAlex) dir: C:\Users\gerbe\PycharmProjects\MT\notebooks\cleantech_data\silver\openalex\2025-08-09
Reports dir         : C:\Users\gerbe\PycharmProjects\MT\notebooks\reports\exploration

2) Helpers (IO, cleaning, hashing, saving)

Description Utility functions for streaming JSONL, text normalization, safe date parsing, and saving with Parquet→CSV.GZ fallback.

Input A file path to JSONL/JSONL.GZ; DataFrames to persist.

Output Python generators/frames; files saved to Silver; logs that note fallback if Parquet engine is missing.

Python
def read_jsonl(path: Path, limit: Optional[int] = None) -> List[Dict[str, Any]]:
    """
    Memory-conscious reader. If 'limit' is given, returns only the first 'limit' objects.
    Handles .jsonl and .jsonl.gz (by extension).

    Only JSON objects (dicts) are kept; a malformed line aborts with RuntimeError.
    """
    collected: List[Dict[str, Any]] = []
    open_fn = gzip.open if str(path).endswith(".gz") else open

    with open_fn(path, mode="rt", encoding="utf-8", errors="replace") as fh:
        for lineno, raw in enumerate(fh, 1):
            stripped = raw.strip()
            if not stripped:
                continue
            try:
                parsed = json.loads(stripped)
            except json.JSONDecodeError as exc:
                raise RuntimeError(f"Could not parse JSON/JSONL at line {lineno}: {exc}") from exc
            if isinstance(parsed, dict):
                collected.append(parsed)
            if limit and len(collected) >= limit:
                break
    return collected

def iter_jsonl(path: Path) -> Iterable[Dict[str, Any]]:
    """ Streaming iterator over JSONL/JSONL.GZ. Skips blank and malformed lines. """
    open_fn = gzip.open if str(path).endswith(".gz") else open

    with open_fn(path, mode="rt", encoding="utf-8", errors="replace") as fh:
        for raw in fh:
            candidate = raw.strip()
            if not candidate:
                continue
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            if isinstance(parsed, dict):
                yield parsed

def slugify(s: str) -> str:
    """Lowercase, strip punctuation, and join tokens with single dashes."""
    text = (s or "").strip().lower()
    text = re.sub(r"[^\w\s-]", "", text)   # drop everything except word chars, whitespace, hyphen
    text = re.sub(r"\s+", "-", text)       # whitespace runs -> single dash
    text = re.sub(r"-{2,}", "-", text)     # squeeze repeated dashes
    return text.strip("-")

def sha1_hex(text: str) -> str:
    """Return the hex SHA-1 digest of *text* encoded as UTF-8."""
    digest = hashlib.sha1()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()

def safe_to_dt(s: Any) -> Optional[pd.Timestamp]:
    """Best-effort UTC timestamp parse; unparseable input yields NaT, errors yield None."""
    try:
        parsed = pd.to_datetime(s, errors="coerce", utc=True)
    except Exception:
        return None
    return parsed

def safe_save(df: pd.DataFrame, name: str) -> Path:
    """
    Save under SILVER_OPENALEX_DIR. Try Parquet first; fallback to CSV.GZ.
    Returns final path.
    """
    target = SILVER_OPENALEX_DIR / name
    try:
        parquet_path = target.with_suffix(".parquet")
        df.to_parquet(parquet_path, index=False)
    except Exception as exc:
        # No Parquet engine (or write failure): degrade to compressed CSV.
        csv_path = target.with_suffix(".csv.gz")
        df.to_csv(csv_path, index=False, compression="gzip")
        print(f"[Fallback CSV.GZ] → {csv_path.name}  rows={len(df):,}  ({exc})")
        return csv_path
    print(f"Saved Parquet → {parquet_path.name}  rows={len(df):,}")
    return parquet_path

3) Language Detection (robust fallbacks)

Description Adds lang_title/_score and lang_desc/_score using pycld3 if available, then langid, then langdetect.

Input topics_canonical.display_name, topics_canonical.description.

Output Language code + score columns for title and description.

Cleaning Objective Enable language filtering and QA on text coverage; catch multilingual anomalies early.

Why data-driven Empirical language signals guide later filtering, enrichment, and evaluation.

Python
# Resolve the best available language detector at import time:
# pycld3 → langid → langdetect → None (detection disabled).
DETECTOR = None

try:
    import pycld3

    def _detect_lang(text: str):
        """(code, score) via pycld3; short/empty text is 'unknown'."""
        stripped = (text or "").strip()
        if len(stripped) < 25:
            return ("unknown", 0.0)
        result = pycld3.get_language(stripped)
        if not result:
            return ("unknown", 0.0)
        return (result.language or "unknown", float(getattr(result, "probability", 0.0)))

    DETECTOR = "pycld3"
except Exception:
    try:
        import langid
        langid.set_languages(None)

        def _detect_lang(text: str):
            """(code, score) via langid; scores are log-probs, so negatives are normal."""
            stripped = (text or "").strip()
            if len(stripped) < 25:
                return ("unknown", 0.0)
            code, score = langid.classify(stripped)
            return (code, float(score))

        DETECTOR = "langid"
    except Exception:
        try:
            from langdetect import detect, DetectorFactory
            DetectorFactory.seed = 0

            def _detect_lang(text: str):
                """(code, 1.0) via langdetect; 'unknown' on failure or short text."""
                stripped = (text or "").strip()
                if len(stripped) < 25:
                    return ("unknown", 0.0)
                try:
                    return (detect(stripped), 1.0)
                except Exception:
                    return ("unknown", 0.0)

            DETECTOR = "langdetect"
        except Exception:
            DETECTOR = None

def add_lang_cols(df: pd.DataFrame, text_col: str, code_col: str, score_col: str) -> pd.DataFrame:
    """Attach language code/score columns derived from df[text_col]; no-op when no detector is available."""
    if DETECTOR is None or text_col not in df.columns:
        print(f"Language detection skipped for {text_col} (detector={DETECTOR}).")
        return df
    detected = df[text_col].fillna("").astype(str).map(_detect_lang)
    df[code_col] = detected.map(lambda pair: pair[0])
    df[score_col] = detected.map(lambda pair: pair[1])
    top_codes = df[code_col].value_counts().head(10)
    print(f"Lang via {DETECTOR} on '{text_col}' → top codes:")
    display(top_codes.to_frame("count"))
    return df

Data Notes Detector used: langid (negative scores are expected). Title: mostly en, small tail (unknown, de, fr, …). Description: en for all 4,516.

4) Load a Sample & Peek (Schema sanity)

Description Reads a tiny slice to validate schema and nested keys before full load.

Input topics_path (JSONL).

Output A normalized preview with columns like id, display_name, description, keywords, siblings, taxonomy fields, counts, dates.

Cleaning Objective Confirm field presence/types to prevent surprises mid-pipeline.

Python
1
2
3
# Peek at the first 3 raw objects to validate schema before the full load.
sample_rows = read_jsonl(topics_path, limit=3)
print(f"Sample objects: {len(sample_rows)}")
display(pd.json_normalize(sample_rows).T)
Text Only
1
Sample objects: 3
0 1 2
id https://openalex.org/T11881 https://openalex.org/T11475 https://openalex.org/T13445
display_name Crystallization and Solubility Studies French Urban and Social Studies American Constitutional Law and Politics
description This cluster of papers focuses on the crystallization processes and control, including topics such as nucleation, so... This cluster of papers explores the intersection of territorial governance, environmental participation, and sustain... This cluster of papers explores the development of American political thought and history, focusing on topics such a...
keywords [Crystallization, Nucleation, Solubility, Polymorphism, Ultrasound-Assisted Crystallization, Process Analytical Tech... [Territorial Governance, Environmental Participation, Sustainable Development, Citizen Participation, Local Developm... [American founding, Constitutional government, Religious freedom, Public opinion, Separation of church and state, Pr...
siblings [{'id': 'https://openalex.org/T10275', 'display_name': '2D Materials and Applications'}, {'id': 'https://openalex.or... [{'id': 'https://openalex.org/T10927', 'display_name': 'Access Control and Trust'}, {'id': 'https://openalex.org/T13... [{'id': 'https://openalex.org/T14181', 'display_name': 'Academic Freedom and Politics'}, {'id': 'https://openalex.or...
works_count 975416 656763 467988
cited_by_count 792291 703244 1823982
updated_date 2025-08-04T05:30:13.285909 2025-08-04T05:21:24.303694 2025-08-04T05:26:43.888907
created_date 2024-01-23 2024-01-23 2024-01-23
ids.openalex https://openalex.org/T11881 https://openalex.org/T11475 https://openalex.org/T13445
ids.wikipedia https://en.wikipedia.org/wiki/Crystallization https://en.wikipedia.org/wiki/Territorial_governance https://en.wikipedia.org/wiki/American_political_history
subfield.id https://openalex.org/subfields/2505 https://openalex.org/subfields/3312 https://openalex.org/subfields/3320
subfield.display_name Materials Chemistry Sociology and Political Science Political Science and International Relations
field.id https://openalex.org/fields/25 https://openalex.org/fields/33 https://openalex.org/fields/33
field.display_name Materials Science Social Sciences Social Sciences
domain.id https://openalex.org/domains/3 https://openalex.org/domains/2 https://openalex.org/domains/2
domain.display_name Physical Sciences Social Sciences Social Sciences

Data Notes Sample shows expected keys and nested lists/dicts (keywords, siblings) and ISO timestamps.

5) Full Load (stream → DataFrame)

Description Streams all JSONL records into a flat DataFrame, keeping a raw copy for audits.

Input Entire topics.jsonl.

Output df_raw and df_openalex_raw.

Cleaning Objective Load everything deterministically without blowing memory.

Why data-driven Streaming + normalization scales and preserves original fields for later inspection.

Python
# Stream every JSONL record into memory (iter_jsonl skips malformed lines).
records: List[Dict[str, Any]] = list(iter_jsonl(topics_path))
print(f"Loaded {len(records):,} topic records from {topics_path.name}")

# Flatten one level of nesting (e.g. ids.openalex, subfield.display_name).
df_raw = pd.json_normalize(records, max_level=1)
if N_ROWS is not None:  # optional row cap defined earlier in the notebook
    df_raw = df_raw.head(N_ROWS)
print("Raw columns:", list(df_raw.columns))
display(df_raw.head(3))

# Keep an untouched copy for audits (mirrors your media/patent pattern)
df_openalex_raw = df_raw.copy(deep=True)
Text Only
1
2
Loaded 4,516 topic records from topics.jsonl
Raw columns: ['id', 'display_name', 'description', 'keywords', 'siblings', 'works_count', 'cited_by_count', 'updated_date', 'created_date', 'ids.openalex', 'ids.wikipedia', 'subfield.id', 'subfield.display_name', 'field.id', 'field.display_name', 'domain.id', 'domain.display_name']
id display_name description keywords siblings works_count cited_by_count updated_date created_date ids.openalex ids.wikipedia subfield.id subfield.display_name field.id field.display_name domain.id domain.display_name
0 https://openalex.org/T11881 Crystallization and Solubility Studies This cluster of papers focuses on the crystallization processes and control, including topics such as nucleation, so... [Crystallization, Nucleation, Solubility, Polymorphism, Ultrasound-Assisted Crystallization, Process Analytical Tech... [{'id': 'https://openalex.org/T10275', 'display_name': '2D Materials and Applications'}, {'id': 'https://openalex.or... 975416 792291 2025-08-04T05:30:13.285909 2024-01-23 https://openalex.org/T11881 https://en.wikipedia.org/wiki/Crystallization https://openalex.org/subfields/2505 Materials Chemistry https://openalex.org/fields/25 Materials Science https://openalex.org/domains/3 Physical Sciences
1 https://openalex.org/T11475 French Urban and Social Studies This cluster of papers explores the intersection of territorial governance, environmental participation, and sustain... [Territorial Governance, Environmental Participation, Sustainable Development, Citizen Participation, Local Developm... [{'id': 'https://openalex.org/T10927', 'display_name': 'Access Control and Trust'}, {'id': 'https://openalex.org/T13... 656763 703244 2025-08-04T05:21:24.303694 2024-01-23 https://openalex.org/T11475 https://en.wikipedia.org/wiki/Territorial_governance https://openalex.org/subfields/3312 Sociology and Political Science https://openalex.org/fields/33 Social Sciences https://openalex.org/domains/2 Social Sciences
2 https://openalex.org/T13445 American Constitutional Law and Politics This cluster of papers explores the development of American political thought and history, focusing on topics such a... [American founding, Constitutional government, Religious freedom, Public opinion, Separation of church and state, Pr... [{'id': 'https://openalex.org/T14181', 'display_name': 'Academic Freedom and Politics'}, {'id': 'https://openalex.or... 467988 1823982 2025-08-04T05:26:43.888907 2024-01-23 https://openalex.org/T13445 https://en.wikipedia.org/wiki/American_political_history https://openalex.org/subfields/3320 Political Science and International Relations https://openalex.org/fields/33 Social Sciences https://openalex.org/domains/2 Social Sciences

Data Notes Loaded 4,516 topics; columns: ids, names, text, keywords, siblings, counts, dates, taxonomy.

6) Canonicalization Strategy

Description Builds topics_canonical: stable topic_id, clean text, taxonomy IDs/names, counts, timestamps, language columns, de-duped by most recent update.

Input df_raw (normalized fields).

Output topics_canonical with ~24 columns, one row per topic.

Cleaning Objective A tidy, language-aware, audit-ready table keyed by topic_id.

Why data-driven Dedup/selection uses measurable signals (updated/created dates, lengths) and preserves provenance.

Python
def extract_token_from_uri(uri: Optional[str]) -> Optional[str]:
    """ 'https://openalex.org/T11881' -> 'T11881' ; '.../fields/25' -> '25' """
    if not isinstance(uri, str) or not uri:
        return None
    match = re.search(r"/([A-Za-z0-9]+)$", uri.strip("/"))
    if match is None:
        return None
    return match.group(1)

def to_int(x: Any) -> Optional[int]:
    """int(x), with None for missing, NaN, or unparseable values."""
    if x is None:
        return None
    try:
        if isinstance(x, float) and math.isnan(x):
            return None
        return int(x)
    except Exception:
        return None

def clean_text(s: Any) -> str:
    """Stringify, trim, and collapse internal whitespace runs to single spaces."""
    if s is None:
        return ""
    return re.sub(r"\s+", " ", str(s).strip())

RUN_ID = str(uuid.uuid4())

def canonicalize_topics(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build the canonical one-row-per-topic table from the flattened raw frame.

    Steps: extract stable tokens from OpenAlex URIs, clean the text fields,
    lift the taxonomy (domain/field/subfield), coerce metrics and dates, drop
    rows without a topic_id, de-dup by topic_id keeping the most recently
    updated row, add length/language QA columns, and order the output columns.
    """
    out = pd.DataFrame()
    # 'https://openalex.org/T11881' -> 'T11881'; the full URL is kept alongside.
    out["topic_id"]      = df.get("id", pd.Series(index=df.index)).map(extract_token_from_uri)
    out["topic_url"]     = df.get("id", pd.Series(index=df.index)).astype(str)

    out["display_name"]  = df.get("display_name", pd.Series(index=df.index)).map(clean_text)
    out["description"]   = df.get("description",  pd.Series(index=df.index)).map(clean_text)

    # External IDs
    # NOTE(review): astype(str) turns NaN into the literal "nan", which is mapped
    # back to NaN here; an explicit None would become "None" and slip through —
    # confirm the Bronze data never carries explicit nulls in these fields.
    out["openalex_id"]   = df.get("ids.openalex",  pd.Series(index=df.index)).astype(str).replace({"nan": np.nan})
    out["wikipedia_url"] = df.get("ids.wikipedia", pd.Series(index=df.index)).astype(str).replace({"nan": np.nan})

    # Taxonomy (domain/field/subfield)
    out["domain_id"]     = df.get("domain.id",     pd.Series(index=df.index)).map(extract_token_from_uri)
    out["domain_name"]   = df.get("domain.display_name", pd.Series(index=df.index)).map(clean_text)
    out["field_id"]      = df.get("field.id",      pd.Series(index=df.index)).map(extract_token_from_uri)
    out["field_name"]    = df.get("field.display_name",  pd.Series(index=df.index)).map(clean_text)
    out["subfield_id"]   = df.get("subfield.id",   pd.Series(index=df.index)).map(extract_token_from_uri)
    out["subfield_name"] = df.get("subfield.display_name", pd.Series(index=df.index)).map(clean_text)

    # Metrics & dates
    out["works_count"]    = df.get("works_count",    pd.Series(index=df.index)).map(to_int)
    out["cited_by_count"] = df.get("cited_by_count", pd.Series(index=df.index)).map(to_int)
    out["created_dt"]     = pd.to_datetime(df.get("created_date", pd.Series(index=df.index)), errors="coerce", utc=True)
    out["updated_dt"]     = pd.to_datetime(df.get("updated_date", pd.Series(index=df.index)), errors="coerce", utc=True)

    # System/provenance
    out["ingest_run_id"] = RUN_ID
    out["source"]        = "openalex.topics"

    # Cleanups
    before = len(out)
    out = out.dropna(subset=["topic_id"])
    after = len(out)
    if before != after:
        print(f"Dropped {before - after} rows missing topic_id")

    # De-dup by topic_id (prefer latest updated_dt)
    out = out.sort_values(["updated_dt", "created_dt"], ascending=[False, False])
    out = out.drop_duplicates(subset=["topic_id"], keep="first")

    # QA helpers
    out["display_name_len"] = out["display_name"].str.len().fillna(0).astype(int)
    out["description_len"]  = out["description"].str.len().fillna(0).astype(int)

    # Language
    # add_lang_cols is a no-op (with a log line) when no detector is installed.
    out = add_lang_cols(out, "display_name", "lang_title", "lang_title_score")
    out = add_lang_cols(out, "description",  "lang_desc",  "lang_desc_score")

    # Final column order; the filter tolerates skipped language columns.
    cols = [
        "topic_id","topic_url","display_name","description",
        "lang_title","lang_title_score","lang_desc","lang_desc_score",
        "works_count","cited_by_count",
        "domain_id","domain_name","field_id","field_name","subfield_id","subfield_name",
        "created_dt","updated_dt",
        "display_name_len","description_len",
        "openalex_id","wikipedia_url",
        "source","ingest_run_id",
    ]
    return out[[c for c in cols if c in out.columns]]

# Build the canonical one-row-per-topic table and preview its shape and head.
topics_canonical = canonicalize_topics(df_raw)
print("topics_canonical:", topics_canonical.shape)
display(topics_canonical.head(5))
Text Only
1
Lang via langid on 'display_name' → top codes:
count
lang_title
en 4215
unknown 79
de 62
fr 58
es 18
it 13
nl 12
da 9
pt 8
et 7
Text Only
1
Lang via langid on 'description' → top codes:
count
lang_desc
en 4516
Text Only
1
topics_canonical: (4516, 24)
topic_id topic_url display_name description lang_title lang_title_score lang_desc lang_desc_score works_count cited_by_count ... subfield_id subfield_name created_dt updated_dt display_name_len description_len openalex_id wikipedia_url source ingest_run_id
488 T10458 https://openalex.org/T10458 Bladder and Urothelial Cancer Treatments This cluster of papers focuses on the diagnosis, treatment, and management of bladder cancer, particularly urothelia... en -14.97 en -559.53 98312 1236380 ... 2746 Surgery 2024-01-23 00:00:00+00:00 2025-08-04 06:06:45.191053+00:00 40 400 https://openalex.org/T10458 https://en.wikipedia.org/wiki/Bladder_cancer openalex.topics ebe6f86e-8ade-490f-b40d-95b7981d6a72
839 T14028 https://openalex.org/T14028 Healthcare and Venom Research This cluster of papers explores the therapeutic potential of bee venom and its constituent compounds, particularly m... en -35.30 en -837.38 77723 216272 ... 2736 Pharmacology 2024-01-23 00:00:00+00:00 2025-08-04 06:06:45.002817+00:00 29 455 https://openalex.org/T14028 https://en.wikipedia.org/wiki/Bee_venom openalex.topics ebe6f86e-8ade-490f-b40d-95b7981d6a72
987 T12675 https://openalex.org/T12675 Financial Reporting and Valuation Research This cluster of papers focuses on value-based management, economic performance measurement, and corporate valuation.... en -30.12 en -633.92 71836 409158 ... 1408 Strategy and Management 2024-01-23 00:00:00+00:00 2025-08-04 06:06:44.792275+00:00 42 419 https://openalex.org/T12675 https://en.wikipedia.org/wiki/Value-based_management openalex.topics ebe6f86e-8ade-490f-b40d-95b7981d6a72
3717 T11920 https://openalex.org/T11920 Pulsed Power Technology Applications This cluster of papers focuses on the technology and applications of pulsed power, including electron beam surface m... en -11.21 en -374.55 14869 77455 ... 2207 Control and Systems Engineering 2024-01-23 00:00:00+00:00 2025-08-04 06:06:44.621627+00:00 36 269 https://openalex.org/T11920 https://en.wikipedia.org/wiki/Pulsed_power openalex.topics ebe6f86e-8ade-490f-b40d-95b7981d6a72
903 T14388 https://openalex.org/T14388 Medieval European History and Architecture This cluster of papers explores the evolution and influence of medieval architecture, particularly focusing on castl... en -44.18 en -700.27 75302 53317 ... 1204 Archeology 2024-01-23 00:00:00+00:00 2025-08-04 06:06:44.450834+00:00 42 363 https://openalex.org/T14388 https://en.wikipedia.org/wiki/Medieval_architecture openalex.topics ebe6f86e-8ade-490f-b40d-95b7981d6a72

5 rows × 24 columns

Data Notes Rows: 4,516. Title language: predominantly en; description language: all en.

7) Explode Keywords (M2M)

Description Extracts topic↔keyword pairs with a normalized slug and original surface form.

Input df_openalex_raw.keywords (lists).

Output topic_keywords_m2m(topic_id, keyword_norm, keyword_orig).

Cleaning Objective Normalize repeated tokens for search, joins, and scoring, while keeping display text.

Why data-driven Distinct counts and DF frequencies of keywords guide retrieval and enrichment.

Python
def build_topic_keywords(raw_df: pd.DataFrame) -> pd.DataFrame:
    """One row per (topic, keyword): normalized slug plus the original surface form."""
    triples: List[Tuple[str, str, str]] = []   # (topic_id, keyword_norm, keyword_orig)
    for _, rec in raw_df.iterrows():
        topic = extract_token_from_uri(rec.get("id"))
        if not topic:
            continue
        keywords = rec.get("keywords") or []
        if not isinstance(keywords, list):
            continue
        for raw_kw in keywords:
            if not isinstance(raw_kw, str):
                continue
            surface = clean_text(raw_kw)
            slug = slugify(surface)
            if slug:
                triples.append((topic, slug, surface))

    return (pd.DataFrame(triples, columns=["topic_id","keyword_norm","keyword_orig"])
            .drop_duplicates()
            .reset_index(drop=True))

# Materialize the topic↔keyword pairs from the untouched raw copy.
topic_keywords_m2m = build_topic_keywords(df_openalex_raw)
print("topic_keywords_m2m:", topic_keywords_m2m.shape)
display(topic_keywords_m2m.head(10))
Text Only
1
topic_keywords_m2m: (45154, 3)
topic_id keyword_norm keyword_orig
0 T11881 crystallization Crystallization
1 T11881 nucleation Nucleation
2 T11881 solubility Solubility
3 T11881 polymorphism Polymorphism
4 T11881 ultrasound-assisted-crystallization Ultrasound-Assisted Crystallization
5 T11881 process-analytical-technology Process Analytical Technology
6 T11881 crystal-growth Crystal Growth
7 T11881 pharmaceutical-crystallization Pharmaceutical Crystallization
8 T11881 continuous-crystallization Continuous Crystallization
9 T11881 crystal-engineering Crystal Engineering

Data Notes Pairs: 45,154; slugs are lowercase, punctuation-stripped, dash-joined.

8) Siblings (topic↔topic M2M)

Description Builds directed edges from each topic to its sibling topics.

Input df_openalex_raw.siblings (list of objects with id, display_name).

Output topic_siblings_m2m(topic_id, sibling_topic_id).

Cleaning Objective Capture topology for later graph analytics and GraphRAG.

Why data-driven Edges are derived solely from observed relationships.

Python
def build_topic_siblings(raw_df: pd.DataFrame) -> pd.DataFrame:
    """Directed topic → sibling edges; self-loops excluded, de-duplicated."""
    pairs: List[Tuple[str, str]] = []
    for _, rec in raw_df.iterrows():
        topic = extract_token_from_uri(rec.get("id"))
        if not topic:
            continue
        siblings = rec.get("siblings") or []
        if not isinstance(siblings, list):
            continue
        for sib in siblings:
            sib_id = extract_token_from_uri(sib.get("id")) if isinstance(sib, dict) else None
            if sib_id and sib_id != topic:
                pairs.append((topic, sib_id))
    return (pd.DataFrame(pairs, columns=["topic_id","sibling_topic_id"])
            .drop_duplicates()
            .reset_index(drop=True))

# Materialize the topic↔sibling edge list from the untouched raw copy.
topic_siblings_m2m = build_topic_siblings(df_openalex_raw)
print("topic_siblings_m2m:", topic_siblings_m2m.shape)
display(topic_siblings_m2m.head(10))
Text Only
1
topic_siblings_m2m: (243006, 2)
topic_id sibling_topic_id
0 T11881 T10275
1 T11881 T13889
2 T11881 T12302
3 T11881 T10440
4 T11881 T10311
5 T11881 T12340
6 T11881 T11471
7 T11881 T11341
8 T11881 T11342
9 T11881 T10074

Data Notes Edges: 243,006 (de-duplicated). Direction is topic→sibling (treat as undirected in graph if needed).

9) Taxonomy (Domains, Fields, Subfields)

Description Emits compact reference tables for domain/field/subfield IDs and names.

Input topics_canonical taxonomy columns.

Output domains_ref, fields_ref, subfields_ref.

Cleaning Objective Provide clean lookup tables to join/crosstab efficiently.

Why data-driven Refs are learned from actual data, not hard-coded lists.

Python
def build_taxonomy(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Distinct (id, name[, parent ids]) lookup tables for domains, fields, subfields."""
    def _ref(cols: List[str], key: str, name_col: str) -> pd.DataFrame:
        # Shared recipe: select, drop rows lacking the key, de-dup, rename to id/name.
        return (df[cols]
                .dropna(subset=[key])
                .drop_duplicates()
                .rename(columns={key: "id", name_col: "name"})
                .reset_index(drop=True))

    domains = _ref(["domain_id", "domain_name"], "domain_id", "domain_name")
    fields = _ref(["field_id", "field_name", "domain_id"], "field_id", "field_name")
    subfields = _ref(["subfield_id", "subfield_name", "field_id", "domain_id"], "subfield_id", "subfield_name")
    return domains, fields, subfields

# Derive the three taxonomy lookup tables from the canonical topic rows.
domains_ref, fields_ref, subfields_ref = build_taxonomy(topics_canonical)
print("domains_ref :", domains_ref.shape)
print("fields_ref  :", fields_ref.shape)
print("subfields_ref:", subfields_ref.shape)
display(domains_ref.head(10))
display(fields_ref.head(10))
display(subfields_ref.head(10))
Text Only
1
2
3
domains_ref : (4, 2)
fields_ref  : (26, 3)
subfields_ref: (252, 4)
id name
0 4 Health Sciences
1 2 Social Sciences
2 3 Physical Sciences
3 1 Life Sciences
id name domain_id
0 27 Medicine 4
1 14 Business, Management and Accounting 2
2 22 Engineering 3
3 12 Arts and Humanities 2
4 23 Environmental Science 3
5 11 Agricultural and Biological Sciences 1
6 30 Pharmacology, Toxicology and Pharmaceutics 1
7 33 Social Sciences 2
8 26 Mathematics 3
9 31 Physics and Astronomy 3
id name field_id domain_id
0 2746 Surgery 27 4
1 2736 Pharmacology 27 4
2 1408 Strategy and Management 14 2
3 2207 Control and Systems Engineering 22 3
4 1204 Archeology 12 2
5 2308 Management, Monitoring, Policy and Law 23 3
6 1105 Ecology, Evolution, Behavior and Systematics 11 1
7 2728 Neurology 27 4
8 3003 Pharmaceutical Science 30 1
9 2202 Aerospace Engineering 22 3

Data Notes Domains: 4, Fields: 26, Subfields: 252.

10) QA Slices & EDA

Description Null coverage, description-length stats, and top domain/field/subfield aggregations; language distributions.

Input topics_canonical.

Output Printed summaries/dataframes for quick human QA.

Cleaning Objective Detect coverage gaps, outliers, and skew that might require rules.

Python
def null_coverage(df: pd.DataFrame) -> pd.DataFrame:
    """Per-column null fraction (descending) with its non-null complement."""
    coverage = df.isna().mean().rename("null_frac").to_frame()
    coverage["non_null_frac"] = 1 - coverage["null_frac"]
    return coverage.sort_values("null_frac", ascending=False)

# Row count and per-column null coverage for a quick completeness check.
print("Canonical rows:", len(topics_canonical))
display(null_coverage(topics_canonical).head(20))

# Distribution of description lengths (spot truncation or outliers).
desc_len_summary = topics_canonical["description_len"].describe(percentiles=[0.25,0.5,0.75,0.9,0.95]).to_frame("description_len")
display(desc_len_summary)

def top_groups(df: pd.DataFrame, group_col: str, k: int = 15):
    """Top-k values of `group_col` ranked by summed works, then topic count.

    For each group, aggregates the number of topics, the total works_count,
    and the total cited_by_count. NaN group keys are dropped.
    """
    agg_spec = dict(
        topics=("topic_id", "count"),
        works=("works_count", "sum"),
        cited=("cited_by_count", "sum"),
    )
    summary = df.groupby([group_col], dropna=True, as_index=False).agg(**agg_spec)
    summary = summary.sort_values(["works", "topics"], ascending=[False, False])
    return summary.head(k)

print("Top domains:")
display(top_groups(topics_canonical, "domain_name"))

print("Top fields:")
display(top_groups(topics_canonical, "field_name"))

print("Top subfields:")
display(top_groups(topics_canonical, "subfield_name"))

print("Language (title):")
display(topics_canonical["lang_title"].value_counts().head(10).to_frame("count"))

print("Language (description):")
display(topics_canonical["lang_desc"].value_counts().head(10).to_frame("count"))
Text Only
1
Canonical rows: 4516
null_frac non_null_frac
topic_id 0.00 1.00
topic_url 0.00 1.00
display_name 0.00 1.00
description 0.00 1.00
lang_title 0.00 1.00
lang_title_score 0.00 1.00
lang_desc 0.00 1.00
lang_desc_score 0.00 1.00
works_count 0.00 1.00
cited_by_count 0.00 1.00
domain_id 0.00 1.00
domain_name 0.00 1.00
field_id 0.00 1.00
field_name 0.00 1.00
subfield_id 0.00 1.00
subfield_name 0.00 1.00
created_dt 0.00 1.00
updated_dt 0.00 1.00
display_name_len 0.00 1.00
description_len 0.00 1.00
description_len
count 4,516.00
mean 383.54
std 72.35
min 192.00
25% 330.75
50% 378.00
75% 430.00
90% 479.00
95% 510.00
max 828.00
Text Only
1
Top domains:
domain_name topics works cited
2 Physical Sciences 1571 79595737 1029880409
3 Social Sciences 1487 69110886 456769072
0 Health Sciences 844 46865631 619762736
1 Life Sciences 614 29206813 556886623
Text Only
1
Top fields:
field_name topics works cited
18 Medicine 692 38667454 551729528
24 Social Sciences 764 34130674 174061630
12 Engineering 560 26868545 275617213
1 Arts and Humanities 266 14120823 49066260
6 Computer Science 302 13155685 146971048
2 Biochemistry, Genetics and Molecular Biology 248 12010530 284829406
0 Agricultural and Biological Sciences 235 10958538 122928997
13 Environmental Science 202 9896376 144116583
22 Physics and Astronomy 104 7505028 118302142
16 Materials Science 123 7328878 120703476
3 Business, Management and Accounting 146 6648057 58772091
10 Economics, Econometrics and Finance 107 6075240 54959631
23 Psychology 144 6066953 91163020
14 Health Professions 107 5988827 37002215
5 Chemistry 101 5417164 81827664
Text Only
1
Top subfields:
subfield_name topics works cited
221 Sociology and Political Science 224 9751449 59687942
161 Molecular Biology 135 6461134 174270736
198 Political Science and International Relations 118 6362003 24963096
72 Education 137 5740594 27751204
73 Electrical and Electronic Engineering 117 5696298 71704839
233 Surgery 96 4708348 54081133
151 Materials Chemistry 69 4433541 67064151
71 Economics and Econometrics 79 4221741 35483020
197 Plant Science 90 4082168 53236098
17 Artificial Intelligence 77 3527534 49031495
103 General Health Professions 50 3479281 21346642
153 Mechanical Engineering 79 3278289 30943503
27 Biomedical Engineering 72 3138035 50719592
204 Public Health, Environmental and Occupational Health 54 3065979 32451151
132 Information Systems 78 2989948 15376972
Text Only
1
Language (title):
count
lang_title
en 4215
unknown 79
de 62
fr 58
es 18
it 13
nl 12
da 9
pt 8
et 7
Text Only
1
Language (description):
count
lang_desc
en 4516

Data Notes No nulls in core columns. Description length median ≈ 378 chars (95th ≈ 510). Top domains: Physical & Social Sciences; top fields include Medicine, Social Sciences, Engineering.

11) Save Silver Drafts + Audits (aligned dirs)

Description Persists all Silver tables to silver/openalex/<bucket>/… and writes small audit CSVs to reports/exploration.

Input topics_canonical, topic_keywords_m2m, topic_siblings_m2m, taxonomy refs.

Output Parquet files (or CSV.GZ fallback) + audit CSVs.

Cleaning Objective Durable, analyzable artifacts for downstream pipelines and reviewers.

Python
# Persist every Silver table (row count baked into the artifact name for
# quick eyeballing) and drop small audit CSVs alongside the other
# exploration diagnostics.
p_topics = safe_save(topics_canonical, f"topics_canonical_{len(topics_canonical)}")
p_kw_m2m = safe_save(topic_keywords_m2m, f"topic_keywords_m2m_{len(topic_keywords_m2m)}")
p_sib_m2m = safe_save(topic_siblings_m2m, f"topic_siblings_m2m_{len(topic_siblings_m2m)}")
p_domains = safe_save(domains_ref, f"domains_ref_{len(domains_ref)}")
p_fields = safe_save(fields_ref, f"fields_ref_{len(fields_ref)}")
p_subfields = safe_save(subfields_ref, f"subfields_ref_{len(subfields_ref)}")

# Audit CSVs → same reports/exploration folder used elsewhere.
audit_cols = [
    "topic_id", "display_name", "domain_name", "field_name", "subfield_name",
    "works_count", "cited_by_count", "lang_title", "lang_desc", "updated_dt",
]
topics_canonical[audit_cols].head(50).to_csv(
    REPORTS_DIR / "openalex_topics_head50.csv", index=False)
topic_keywords_m2m.head(200).to_csv(
    REPORTS_DIR / "openalex_topic_keywords_head200.csv", index=False)
topic_siblings_m2m.head(200).to_csv(
    REPORTS_DIR / "openalex_topic_siblings_head200.csv", index=False)

print("Audit CSVs →", REPORTS_DIR)
print("Silver OpenAlex written to →", SILVER_OPENALEX_DIR)
Text Only
1
2
3
4
5
6
7
8
Saved Parquet → topics_canonical_4516.parquet  rows=4,516
Saved Parquet → topic_keywords_m2m_45154.parquet  rows=45,154
Saved Parquet → topic_siblings_m2m_243006.parquet  rows=243,006
Saved Parquet → domains_ref_4.parquet  rows=4
Saved Parquet → fields_ref_26.parquet  rows=26
Saved Parquet → subfields_ref_252.parquet  rows=252
Audit CSVs → C:\Users\gerbe\PycharmProjects\MT\notebooks\reports\exploration
Silver OpenAlex written to → C:\Users\gerbe\PycharmProjects\MT\notebooks\cleantech_data\silver\openalex\2025-08-09

12) (Optional) Normalized Keyword Dictionary

Description Creates a DF of unique keyword_norm with doc frequency and an example surface form.

Input topic_keywords_m2m.

Output keyword_dictionary (norm, df_count, example).

Cleaning Objective Enable quick filtering/stopwording and build BM25/dense vocabularies.

Why data-driven Frequency-based pruning and weighting perform better than ad-hoc lists.

Python
1
2
3
4
5
6
7
8
# One row per normalized keyword: document frequency across topics plus a
# representative original surface form (first one encountered).
grouped = topic_keywords_m2m.groupby(["keyword_norm"], as_index=False)
kw_dict = grouped.agg(
    df_count=("topic_id", "count"),
    example=("keyword_orig", "first"),
)
kw_dict = kw_dict.sort_values("df_count", ascending=False)

display(kw_dict.head(30))
safe_save(kw_dict, f"keyword_dictionary_{len(kw_dict)}")
keyword_norm df_count example
10049 globalization 186 Globalization
6927 education 161 Education
7827 epidemiology 140 Epidemiology
24240 sustainability 140 Sustainability
3781 climate-change 132 Climate Change
... ... ... ...
25570 treatment 52 Treatment
4176 colonialism 52 Colonialism
1076 antioxidant 51 antioxidant
14832 mental-health 48 Mental Health
7001 educational-technology 48 Educational Technology

30 rows × 3 columns

Text Only
1
2
3
4
5
6
7
Saved Parquet → keyword_dictionary_27180.parquet  rows=27,180





WindowsPath('C:/Users/gerbe/PycharmProjects/MT/notebooks/cleantech_data/silver/openalex/2025-08-09/keyword_dictionary_27180.parquet')

Data Notes Rows: 27,180; top items include globalization, education, epidemiology, sustainability, climate-change.

13) Final Checks

Description Prints the final shape snapshot of every produced table and the Silver directory.

Input All in-memory Silver DFs.

Output Counts for canonical, M2M, refs, and final path.

Python
1
2
3
4
5
6
7
8
print("=== FINAL SNAPSHOT ===")
print("topics_canonical     :", topics_canonical.shape)
print("topic_keywords_m2m   :", topic_keywords_m2m.shape)
print("topic_siblings_m2m   :", topic_siblings_m2m.shape)
print("domains_ref          :", domains_ref.shape)
print("fields_ref           :", fields_ref.shape)
print("subfields_ref        :", subfields_ref.shape)
print("Silver dir           :", SILVER_OPENALEX_DIR)
Text Only
1
2
3
4
5
6
7
8
=== FINAL SNAPSHOT ===
topics_canonical     : (4516, 24)
topic_keywords_m2m   : (45154, 3)
topic_siblings_m2m   : (243006, 2)
domains_ref          : (4, 2)
fields_ref           : (26, 3)
subfields_ref        : (252, 4)
Silver dir           : C:\Users\gerbe\PycharmProjects\MT\notebooks\cleantech_data\silver\openalex\2025-08-09

Data Notes topics_canonical 4,516; topic_keywords_m2m 45,154; topic_siblings_m2m 243,006; refs (4 / 26 / 252); Silver dir confirmed.