This notebook creates a stratified subsample of cleantech_data/silver/unified/unified_docs.parquet,
validates representativeness (simple metrics) and writes the sample to cleantech_data/silver_subsample/.
Only the media and patent documents are subsampled; OpenAlex topics are kept in full.
# Notebook setup: imports, sampling parameters, and loading the input table.
import pathlib
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# from a notebook inside MT/notebooks/
# The path must be extended BEFORE importing `subsample`, otherwise the
# project-local package is not importable.
sys.path.append(str(pathlib.Path.cwd().parent / "src" / "pipeline"))
# or simply: sys.path.append("../src/pipeline")

from subsample import paths
from subsample.sampler import stratified_sample, evaluate_distribution

pd.set_option('display.max_columns', 0)

# Sampling parameters; presumably consumed by stratified_sample in a later
# cell (not visible here) -- confirm against the sampler's signature.
BY = ['doc_type', 'lang']
FRAC = 0.05  # 5% default
SEED = 42

in_path = paths.default_input_path()
print('[notebook] reading:', in_path)

# Choose the reader from the file extension: parquet files go through
# read_parquet, anything else is read as CSV.
if in_path.suffix.lower() in {'.parquet', '.pq'}:
    df = pd.read_parquet(in_path)
else:
    df = pd.read_csv(in_path)

# Last expression of the cell: the notebook displays the row count.
len(df)
# Exploratory plots of the full population (`df`) before sampling.

# --- doc_type distribution ---
# Missing values are counted under an explicit 'unknown' label.
pop_counts = df['doc_type'].fillna('unknown').astype(str).value_counts().sort_index()
print(pop_counts)
plt.figure()
pop_counts.plot(kind='bar', rot=45)
plt.title('Population: doc_type counts')
plt.tight_layout()
plt.show()

# --- language distribution ---
# Only the 15 most frequent languages are shown.
lang_counts = df['lang'].fillna('unknown').astype(str).value_counts().head(15)
plt.figure()
lang_counts.plot(kind='bar', rot=45)
plt.title('Population: top languages')
plt.tight_layout()
plt.show()

# --- monthly date histogram ---
if 'date' in df.columns:
    # errors='coerce' turns unparseable dates into NaT; drop them so they do
    # not show up as a bogus 'NaT' bucket at the end of the monthly line.
    dates = pd.to_datetime(df['date'], errors='coerce').dropna()
    months = dates.dt.to_period('M').astype(str)
    mon_counts = months.value_counts().sort_index()
    # pandas refuses to plot an empty series, so skip the chart when no date
    # could be parsed.
    if not mon_counts.empty:
        plt.figure()
        mon_counts.plot(kind='line')
        plt.title('Population: items per month')
        plt.tight_layout()
        plt.show()

# --- text length (quick proxy) ---
if 'text' in df.columns:
    # NOTE: astype(str) stringifies missing values rather than dropping them,
    # so they are counted as very short texts in the first bin.
    length = df['text'].astype(str).str.len()
    plt.figure()
    # Clip long documents so the tail does not flatten the histogram.
    length.clip(upper=20000).plot(kind='hist', bins=50)
    plt.title('Population: text length (clipped)')
    plt.tight_layout()
    plt.show()
[notebook] sample rows: 7051
doc_type lang
patent en 1444
media en 967
patent zh 111
ko 7
fr 5
de 1
ja 0
pl 0
ar 0
ru 0
pt 0
media la 0
de 0
patent cs 0
media es 0
it 0
ru 0
patent es 0
lv 0
ro 0
Name: size, dtype: int64