This commit is contained in:
Michał Flak
2026-02-24 02:36:30 +01:00
parent f8a93a5d42
commit 51120ec6f2

View File

@@ -3,7 +3,6 @@
# dependencies = [ # dependencies = [
# "marimo", # "marimo",
# "httpx", # "httpx",
# "polars",
# ] # ]
# /// # ///
@@ -25,9 +24,7 @@ def _():
import time import time
from pathlib import Path from pathlib import Path
import polars as pl return Path, csv, hashlib, httpx, io, json, mo, os, time
return Path, csv, hashlib, httpx, io, json, mo, os, pl, time
@app.cell(hide_code=True) @app.cell(hide_code=True)
@@ -138,71 +135,6 @@ def _(
return (people,) return (people,)
@app.cell(hide_code=True)
def _():
def extract_apollo(data):
    """Flatten an Apollo enrichment response into a comparison row.

    Args:
        data: Decoded JSON response; the record is read from its
            ``"person"`` key (a missing or null key yields an empty row).

    Returns:
        A dict with the keys ``name``, ``title``, ``company``, ``industry``,
        ``location``, ``linkedin``, ``phones`` and ``found_emails``. Missing
        values become ``""``.
    """
    person = data.get("person") or {}
    organization = person.get("organization") or {}
    location_parts = (person.get("city"), person.get("state"), person.get("country"))

    # Phone entries may be plain strings or objects; joining objects directly
    # would raise TypeError.
    # NOTE(review): the object field names (sanitized_number / raw_number)
    # follow Apollo's documented response shape - confirm against live data.
    phones = []
    for entry in person.get("phone_numbers") or []:
        if isinstance(entry, dict):
            entry = entry.get("sanitized_number") or entry.get("raw_number")
        if entry:
            phones.append(str(entry))

    return {
        "name": person.get("name") or "",
        "title": person.get("title") or "",
        "company": organization.get("name") or "",
        "industry": organization.get("industry") or "",
        # Only the location parts that are present are joined.
        "location": ", ".join(part for part in location_parts if part),
        "linkedin": person.get("linkedin_url") or "",
        "phones": ", ".join(phones),
        "found_emails": person.get("email") or "",
    }
def extract_pdl(data):
    """Flatten a People Data Labs enrichment response into a comparison row.

    Args:
        data: Decoded JSON response. The record is read from its ``"data"``
            key when that is present and truthy, otherwise ``data`` itself
            is treated as the record.

    Returns:
        A dict with the keys ``name``, ``title``, ``company``, ``industry``,
        ``location``, ``linkedin``, ``phones`` and ``found_emails``.
        ``phones`` is the mobile number if available, else up to three
        entries of ``phone_numbers``. ``found_emails`` is the work email
        followed by up to two personal emails.
    """
    record = data.get("data") or data

    def _text(value):
        # Accept only real strings; anything else is treated as absent.
        return value if isinstance(value, str) else ""

    def _texts(value, limit):
        # Slice first, then keep the non-empty strings.
        if not isinstance(value, list):
            return []
        return [item for item in value[:limit] if isinstance(item, str) and item]

    # NOTE(review): PDL is understood to return booleans in place of contact
    # fields on some plans - confirm. Without these guards a boolean breaks
    # the slicing/join below.
    phones = _text(record.get("mobile_phone"))
    if not phones:
        phones = ", ".join(_texts(record.get("phone_numbers"), 3))

    emails = []
    work_email = _text(record.get("work_email"))
    if work_email:
        emails.append(work_email)
    emails.extend(_texts(record.get("personal_emails"), 2))

    return {
        "name": record.get("full_name") or "",
        "title": record.get("job_title") or "",
        "company": record.get("job_company_name") or "",
        "industry": record.get("job_company_industry") or "",
        "location": record.get("location_name") or "",
        "linkedin": record.get("linkedin_url") or "",
        "phones": phones,
        "found_emails": ", ".join(emails),
    }
def extract_fullenrich(data):
    """Flatten a FullEnrich result into a comparison row.

    Args:
        data: Decoded result entry; values are read from its
            ``"contact_info"``, ``"profile"`` and ``"input"`` sub-dicts
            (each may be missing or null).

    Returns:
        A dict with the keys ``name``, ``title``, ``company``, ``industry``
        (always ``""``), ``location``, ``linkedin``, ``phones`` and
        ``found_emails``. Emails are listed in the order most probable work
        email, other work emails, personal email, with each address
        appearing once.
    """
    contact = data.get("contact_info") or {}
    profile = data.get("profile") or {}
    submitted = data.get("input") or {}

    candidates = [
        contact.get("most_probable_work_email"),
        *(contact.get("work_emails") or []),
        contact.get("personal_email"),
    ]
    # dict.fromkeys drops repeats while keeping first-seen order, so the
    # personal email is de-duplicated the same way the work emails are.
    emails = list(dict.fromkeys(email for email in candidates if email))

    return {
        "name": profile.get("full_name") or "",
        "title": profile.get("headline") or "",
        "company": submitted.get("company_name") or "",
        "industry": "",
        "location": profile.get("location") or "",
        "linkedin": submitted.get("linkedin_url") or "",
        "phones": ", ".join(contact.get("phones") or []),
        "found_emails": ", ".join(emails),
    }
return extract_apollo, extract_fullenrich, extract_pdl
@app.cell(hide_code=True) @app.cell(hide_code=True)
def _(mo, people, apollo_key, httpx, read_cache, write_cache): def _(mo, people, apollo_key, httpx, read_cache, write_cache):
apollo_results = {} apollo_results = {}
@@ -257,10 +189,19 @@ def _(mo, people, pdl_key, httpx, read_cache, write_cache):
pdl_results[_email] = _cached pdl_results[_email] = _cached
continue continue
try: try:
_params = {"email": _email, "min_likelihood": 5}
if _person.get("first_name"):
_params["first_name"] = _person["first_name"]
if _person.get("last_name"):
_params["last_name"] = _person["last_name"]
if _person.get("linkedin_url"):
_params["profile"] = _person["linkedin_url"]
if _person.get("domain"):
_params["website"] = _person["domain"]
_r = httpx.get( _r = httpx.get(
"https://api.peopledatalabs.com/v5/person/enrich", "https://api.peopledatalabs.com/v5/person/enrich",
headers={"X-Api-Key": pdl_key.value}, headers={"X-Api-Key": pdl_key.value},
params={"email": _email, "min_likelihood": 5}, params=_params,
timeout=30, timeout=30,
) )
_data = _r.json() _data = _r.json()
@@ -302,10 +243,10 @@ def _(mo, people, fullenrich_key, httpx, read_cache, write_cache, time):
"custom": {"email": _person["email"]}, "custom": {"email": _person["email"]},
} }
_has_id = False _has_id = False
if _person.get("first_name") and _person.get("last_name"): if _person.get("first_name") and _person.get("last_name") and _person.get("domain"):
_entry["first_name"] = _person["first_name"] _entry["first_name"] = _person["first_name"]
_entry["last_name"] = _person["last_name"] _entry["last_name"] = _person["last_name"]
_entry["domain"] = _person.get("domain") or _person["email"].split("@")[1] _entry["domain"] = _person["domain"]
_has_id = True _has_id = True
if _person.get("linkedin_url"): if _person.get("linkedin_url"):
_entry["linkedin_url"] = _person["linkedin_url"] _entry["linkedin_url"] = _person["linkedin_url"]
@@ -366,40 +307,13 @@ def _(mo, people, fullenrich_key, httpx, read_cache, write_cache, time):
return (fullenrich_results,) return (fullenrich_results,)
@app.cell(hide_code=True)
def _(
    mo, people, apollo_results, pdl_results, fullenrich_results,
    extract_apollo, extract_pdl, extract_fullenrich, pl,
):
    # Comparison cell: builds one row per (person, provider) pair for every
    # provider that returned a non-error result, and shows them in a table.
    # Exports ``comparison_df`` (a polars DataFrame, or None when there are
    # no rows) for downstream cells.
    _providers = [
        ("Apollo", apollo_results, extract_apollo),
        ("PDL", pdl_results, extract_pdl),
        ("FullEnrich", fullenrich_results, extract_fullenrich),
    ]

    _rows = []
    for _person in people:
        _email = _person["email"]
        for _provider, _results, _extractor in _providers:
            # Skip providers with no entry for this email or an error entry.
            if _email in _results and "error" not in _results[_email]:
                _extracted = _extractor(_results[_email])
                _rows.append({"email": _email, "provider": _provider, **_extracted})

    comparison_df = pl.DataFrame(_rows) if _rows else None

    # marimo renders only the cell's last top-level expression; an expression
    # nested in an if/else branch is discarded. Build the view first, then
    # end the cell on a bare expression.
    _view = (
        mo.vstack([mo.md("## Comparison"), mo.ui.table(comparison_df)])
        if comparison_df is not None
        else mo.md("## Comparison\n\n*No results yet*")
    )
    _view
    return (comparison_df,)
@app.cell(hide_code=True) @app.cell(hide_code=True)
def _(mo, apollo_results, pdl_results, fullenrich_results, json): def _(mo, apollo_results, pdl_results, fullenrich_results, json):
def _fmt(d): def _fmt(d):
return mo.md(f"```json\n{json.dumps(d, indent=2, default=str)}\n```") return mo.md(f"```json\n{json.dumps(d, indent=2, default=str)}\n```")
mo.vstack([ mo.vstack([
mo.md("## Raw Results"), mo.md("## Results"),
mo.ui.tabs({ mo.ui.tabs({
"Apollo": _fmt(apollo_results), "Apollo": _fmt(apollo_results),
"PDL": _fmt(pdl_results), "PDL": _fmt(pdl_results),
@@ -410,24 +324,17 @@ def _(mo, apollo_results, pdl_results, fullenrich_results, json):
@app.cell(hide_code=True) @app.cell(hide_code=True)
def _(mo, comparison_df, apollo_results, pdl_results, fullenrich_results, json): def _(mo, apollo_results, pdl_results, fullenrich_results, json):
_items = []
if comparison_df is not None:
_csv_bytes = comparison_df.write_csv().encode()
_items.append(
mo.download(_csv_bytes, filename="enrichment_comparison.csv", label="Download CSV")
)
_raw = json.dumps( _raw = json.dumps(
{"apollo": apollo_results, "pdl": pdl_results, "fullenrich": fullenrich_results}, {"apollo": apollo_results, "pdl": pdl_results, "fullenrich": fullenrich_results},
indent=2, indent=2,
default=str, default=str,
).encode() ).encode()
_items.append(
mo.download(_raw, filename="enrichment_raw.json", label="Download Raw JSON")
)
mo.vstack([mo.md("## Export"), mo.hstack(_items)]) mo.vstack([
mo.md("## Export"),
mo.download(_raw, filename="enrichment_raw.json", label="Download Raw JSON"),
])
return return