Subsample Unified Docs (Stratified)¶

This notebook creates a stratified subsample of cleantech_data/silver/unified/unified_docs.parquet, validates its representativeness with simple metrics, and writes the sample to cleantech_data/silver_subsample/. Only the media and patent documents are sub-sampled; OpenAlex topics are kept in full.

Python
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys, pathlib

# The notebook runs from MT/notebooks/, so the pipeline sources live one level
# up under src/pipeline; put that directory on the import path.
# Shortcut with the same effect: sys.path.append("../src/pipeline")
pipeline_dir = pathlib.Path.cwd().parent / "src" / "pipeline"
sys.path.append(str(pipeline_dir))

# Project-local helpers (resolved through the path entry added above).
from subsample import paths
from subsample.sampler import stratified_sample, evaluate_distribution

pd.set_option('display.max_columns', 0)

# Sampling configuration used by the cells below.
BY = ['doc_type', 'lang']  # stratification columns
FRAC = 0.05  # 5% default
SEED = 42  # fixed seed so the draw is reproducible

# Load the unified documents; parquet is expected, anything else is read as CSV.
in_path = paths.default_input_path()
print('[notebook] reading:', in_path)
if in_path.suffix.lower() in {'.parquet', '.pq'}:
    df = pd.read_parquet(in_path)
else:
    df = pd.read_csv(in_path)
len(df)

Text Only
[notebook] reading: C:\Users\gerbe\PycharmProjects\MT\cleantech_data\silver\unified\unified_docs.parquet

55221

Population snapshots¶

Python
# --- doc_type distribution ---
# Missing doc types are bucketed as 'unknown' so they still appear in the counts.
pop_counts = (
    df['doc_type']
    .fillna('unknown')
    .astype(str)
    .value_counts()
    .sort_index()
)
print(pop_counts)

fig, ax = plt.subplots()
pop_counts.plot.bar(rot=45, ax=ax)
ax.set_title('Population: doc_type counts')
fig.tight_layout()
plt.show()

# --- language distribution ---
# Only the 15 most frequent languages are plotted; missing values count as 'unknown'.
lang_counts = (
    df['lang']
    .fillna('unknown')
    .astype(str)
    .value_counts()
    .iloc[:15]
)

fig, ax = plt.subplots()
lang_counts.plot.bar(rot=45, ax=ax)
ax.set_title('Population: top languages')
fig.tight_layout()
plt.show()

# --- monthly date histogram ---
if 'date' in df.columns:
    # errors='coerce' turns missing/unparseable dates into NaT. Drop them before
    # the string conversion; otherwise they become a literal "NaT" category that
    # is counted and plotted as if it were a month.
    parsed_dates = pd.to_datetime(df['date'], errors='coerce').dropna()
    months = parsed_dates.dt.to_period('M').astype(str)
    # 'YYYY-MM' strings sort chronologically, so sort_index gives a time-ordered line.
    mon_counts = months.value_counts().sort_index()
    if mon_counts.empty:
        # Nothing to draw when no row carries a parseable date.
        print('[notebook] no parseable dates; skipping monthly plot')
    else:
        plt.figure()
        mon_counts.plot(kind='line')
        plt.title('Population: items per month')
        plt.tight_layout()
        plt.show()

# --- text length (quick proxy) ---
if 'text' in df.columns:
    # Treat missing text as empty. astype(str) alone would turn None/NaN into the
    # strings 'None'/'nan' and report them as 4- or 3-character documents.
    length = df['text'].fillna('').astype(str).str.len()
    plt.figure()
    # Clip the long tail so a few very large documents do not flatten the histogram.
    length.clip(upper=20000).plot(kind='hist', bins=50)
    plt.title('Population: text length (clipped)')
    plt.tight_layout()
    plt.show()

Text Only
doc_type
media     19341
patent    31364
topic      4516
Name: count, dtype: int64

png

Draw stratified sample¶

Python
# Topic rows are kept in full; every other doc_type goes through the sampler.
keep_mask = df['doc_type'].eq('topic')
keep = df.loc[keep_mask]
to_sample = df.loc[~keep_mask]

# De-duplicate on doc_id only when that column is present.
dedupe_col = 'doc_id' if 'doc_id' in to_sample.columns else None
sample, alloc = stratified_sample(
    to_sample,
    BY,
    frac=FRAC,
    seed=SEED,
    min_per_stratum=0,
    dedupe_on=dedupe_col,
)

# Append the untouched topic rows to the drawn sample.
sample = pd.concat([sample, keep], ignore_index=True)
print('[notebook] sample rows:', len(sample))
alloc.head(20)

Text Only
[notebook] sample rows: 7051





doc_type  lang
patent    en      1444
media     en       967
patent    zh       111
          ko         7
          fr         5
          de         1
          ja         0
          pl         0
          ar         0
          ru         0
          pt         0
media     la         0
          de         0
patent    cs         0
media     es         0
          it         0
          ru         0
patent    es         0
          lv         0
          ro         0
Name: size, dtype: int64

Compare distributions (TV distance)¶

Python
# Topic rows are copied into the sample in full rather than sampled, so comparing
# the whole population with the whole sample mostly measures that deliberate
# over-representation (it dominates tv_doc_type) instead of the quality of the
# stratified draw. Restrict both sides to the doc types that were sub-sampled.
categorical_cols = tuple(c for c in BY if c in df.columns)
sampled_population = df[df['doc_type'] != 'topic']
sampled_rows = sample[sample['doc_type'] != 'topic']
metrics = evaluate_distribution(sampled_population, sampled_rows, categorical=categorical_cols)
metrics

Text Only
{'tv_doc_type': np.float64(0.5586960460890519),
 'tv_lang': np.float64(0.02754160137513329),
 'tv_date_month': np.float64(0.5471905694977584)}

Python
# Side-by-side doc_type counts
# Count doc types in the sample the same way the population counts were built.
samp_counts = (
    sample['doc_type']
    .fillna('unknown')
    .astype(str)
    .value_counts()
    .sort_index()
)
# Align both count series on the union of their labels; absent labels count as 0.
idx = pop_counts.index.union(samp_counts.index)
counts_by_source = {'population': pop_counts, 'sample': samp_counts}
cmp = pd.DataFrame(
    {label: counts.reindex(idx).fillna(0) for label, counts in counts_by_source.items()}
).astype(int)
cmp

	population	sample
doc_type
media	19341	967
patent	31364	1568
topic	4516	4516

Python
# Plot sample vs population (doc_type)
# DataFrame.plot opens its own figure unless it is given an Axes, so a preceding
# plt.figure() only leaves an empty "Figure ... with 0 Axes" behind. Create the
# figure/axes pair explicitly and draw onto it instead.
fig, ax = plt.subplots()
cmp.plot(kind='bar', rot=45, ax=ax)
ax.set_title('Doc Type: population vs sample')
fig.tight_layout()
plt.show()

Text Only
<Figure size 640x480 with 0 Axes>

png

Save subsample¶

Python
out_path = paths.default_output_path()
# Make sure the target directory exists so the first run on a fresh checkout
# does not fail inside to_parquet; this is a no-op when it is already there.
Path(out_path).parent.mkdir(parents=True, exist_ok=True)
sample.to_parquet(out_path, index=False)
print('[notebook] wrote ->', out_path)

Text Only
[notebook] wrote -> C:\Users\gerbe\PycharmProjects\MT\cleantech_data\silver_subsample\unified_docs_subsample.parquet