This commit is contained in:
Michał Flak
2026-02-24 02:36:30 +01:00
parent f8a93a5d42
commit 51120ec6f2

View File

@@ -3,7 +3,6 @@
# dependencies = [
# "marimo",
# "httpx",
# "polars",
# ]
# ///
@@ -25,9 +24,7 @@ def _():
import time
from pathlib import Path
import polars as pl
return Path, csv, hashlib, httpx, io, json, mo, os, pl, time
return Path, csv, hashlib, httpx, io, json, mo, os, time
@app.cell(hide_code=True)
@@ -138,71 +135,6 @@ def _(
return (people,)
@app.cell(hide_code=True)
def _():
def extract_apollo(data):
    """Flatten an Apollo result into the common comparison row.

    Args:
        data: Decoded Apollo response. The person record is read from the
            ``"person"`` key; a missing or empty value yields an all-blank row.

    Returns:
        A dict with the keys ``name``, ``title``, ``company``, ``industry``,
        ``location``, ``linkedin``, ``phones`` and ``found_emails``. Every
        value is a string, empty when the source field is absent.
    """
    person = data.get("person") or {}
    organization = person.get("organization") or {}
    location_parts = [person.get("city"), person.get("state"), person.get("country")]

    # NOTE(review): phone entries appear to be objects carrying the number
    # under "sanitized_number" / "raw_number" rather than bare strings --
    # confirm against the Apollo response schema. Joining such objects
    # directly raises TypeError, so both shapes are accepted here.
    phones = []
    for entry in person.get("phone_numbers") or []:
        if isinstance(entry, dict):
            entry = entry.get("sanitized_number") or entry.get("raw_number")
        if isinstance(entry, str) and entry:
            phones.append(entry)

    return {
        "name": person.get("name") or "",
        "title": person.get("title") or "",
        "company": organization.get("name") or "",
        "industry": organization.get("industry") or "",
        # Skip missing parts so the result never contains dangling commas.
        "location": ", ".join(part for part in location_parts if part),
        "linkedin": person.get("linkedin_url") or "",
        "phones": ", ".join(phones),
        "found_emails": person.get("email") or "",
    }
def extract_pdl(data):
    """Flatten a People Data Labs result into the common comparison row.

    Args:
        data: Decoded PDL response. The record is read from the ``"data"``
            key when present and non-empty, otherwise ``data`` itself is
            treated as the record.

    Returns:
        A dict with the keys ``name``, ``title``, ``company``, ``industry``,
        ``location``, ``linkedin``, ``phones`` and ``found_emails``. Every
        value is a string, empty when the source field is absent.
    """
    record = data.get("data") or data

    # NOTE(review): PDL reportedly returns boolean placeholders in place of
    # fields the API key is not entitled to -- confirm against the PDL docs.
    # Any non-string value is treated as missing so a bool never leaks into
    # the row or breaks slicing / str.join.
    def text(key):
        value = record.get(key)
        return value if isinstance(value, str) else ""

    def texts(key, limit):
        values = record.get(key)
        if not isinstance(values, list):
            return []
        return [v for v in values if isinstance(v, str) and v][:limit]

    # Prefer the mobile number; fall back to at most three listed numbers.
    phones = text("mobile_phone") or ", ".join(texts("phone_numbers", 3))

    # Work email first, then at most two personal addresses.
    emails = []
    work_email = text("work_email")
    if work_email:
        emails.append(work_email)
    emails.extend(texts("personal_emails", 2))

    return {
        "name": text("full_name"),
        "title": text("job_title"),
        "company": text("job_company_name"),
        "industry": text("job_company_industry"),
        "location": text("location_name"),
        "linkedin": text("linkedin_url"),
        "phones": phones,
        "found_emails": ", ".join(emails),
    }
def extract_fullenrich(data):
    """Flatten a FullEnrich result into the common comparison row.

    Args:
        data: Decoded FullEnrich record. Contact details are read from
            ``"contact_info"``, profile details from ``"profile"`` and the
            submitted company / LinkedIn URL from ``"input"``.

    Returns:
        A dict with the keys ``name``, ``title``, ``company``, ``industry``,
        ``location``, ``linkedin``, ``phones`` and ``found_emails``.
        ``industry`` is always empty because no industry field is read from
        this provider.
    """
    contact = data.get("contact_info") or {}
    profile = data.get("profile") or {}
    submitted = data.get("input") or {}

    # NOTE(review): email / phone entries may be objects ({"email": ...},
    # {"number": ...}) rather than bare strings -- confirm against the
    # FullEnrich response schema. Both shapes are accepted so that str.join
    # never receives a dict.
    def pick(entry, key):
        if isinstance(entry, dict):
            entry = entry.get(key)
        return entry if isinstance(entry, str) else ""

    def as_list(value):
        if not value:
            return []
        return value if isinstance(value, list) else [value]

    # Order: most probable work email, remaining work emails, personal email.
    candidates = [
        contact.get("most_probable_work_email"),
        *as_list(contact.get("work_emails")),
        contact.get("personal_email"),
    ]
    emails = []
    for candidate in candidates:
        address = pick(candidate, "email")
        # De-duplicate uniformly, including the personal address.
        if address and address not in emails:
            emails.append(address)

    phones = [pick(entry, "number") for entry in as_list(contact.get("phones"))]

    return {
        "name": profile.get("full_name") or "",
        "title": profile.get("headline") or "",
        "company": submitted.get("company_name") or "",
        "industry": "",
        "location": profile.get("location") or "",
        "linkedin": submitted.get("linkedin_url") or "",
        "phones": ", ".join(number for number in phones if number),
        "found_emails": ", ".join(emails),
    }
return extract_apollo, extract_fullenrich, extract_pdl
@app.cell(hide_code=True)
def _(mo, people, apollo_key, httpx, read_cache, write_cache):
apollo_results = {}
@@ -257,10 +189,19 @@ def _(mo, people, pdl_key, httpx, read_cache, write_cache):
pdl_results[_email] = _cached
continue
try:
_params = {"email": _email, "min_likelihood": 5}
if _person.get("first_name"):
_params["first_name"] = _person["first_name"]
if _person.get("last_name"):
_params["last_name"] = _person["last_name"]
if _person.get("linkedin_url"):
_params["profile"] = _person["linkedin_url"]
if _person.get("domain"):
_params["website"] = _person["domain"]
_r = httpx.get(
"https://api.peopledatalabs.com/v5/person/enrich",
headers={"X-Api-Key": pdl_key.value},
params={"email": _email, "min_likelihood": 5},
params=_params,
timeout=30,
)
_data = _r.json()
@@ -302,10 +243,10 @@ def _(mo, people, fullenrich_key, httpx, read_cache, write_cache, time):
"custom": {"email": _person["email"]},
}
_has_id = False
if _person.get("first_name") and _person.get("last_name"):
if _person.get("first_name") and _person.get("last_name") and _person.get("domain"):
_entry["first_name"] = _person["first_name"]
_entry["last_name"] = _person["last_name"]
_entry["domain"] = _person.get("domain") or _person["email"].split("@")[1]
_entry["domain"] = _person["domain"]
_has_id = True
if _person.get("linkedin_url"):
_entry["linkedin_url"] = _person["linkedin_url"]
@@ -366,40 +307,13 @@ def _(mo, people, fullenrich_key, httpx, read_cache, write_cache, time):
return (fullenrich_results,)
@app.cell(hide_code=True)
def _(
    mo, people, apollo_results, pdl_results, fullenrich_results,
    extract_apollo, extract_pdl, extract_fullenrich, pl,
):
    # Build one normalized row per (person, provider) pair for which the
    # provider returned a result that has no "error" key, then show the rows
    # side by side in a table.
    _providers = [
        ("Apollo", apollo_results, extract_apollo),
        ("PDL", pdl_results, extract_pdl),
        ("FullEnrich", fullenrich_results, extract_fullenrich),
    ]
    _rows = []
    for _person in people:
        _email = _person["email"]
        for _provider, _results, _extractor in _providers:
            if _email in _results and "error" not in _results[_email]:
                _rows.append(
                    {"email": _email, "provider": _provider, **_extractor(_results[_email])}
                )

    # None (rather than an empty frame) signals "nothing to compare yet".
    comparison_df = pl.DataFrame(_rows) if _rows else None

    # The cell's visual output is its final top-level expression; expressions
    # nested inside an if/else statement are evaluated but not displayed, so
    # the choice is written as a single conditional expression.
    (
        mo.vstack([mo.md("## Comparison"), mo.ui.table(comparison_df)])
        if comparison_df is not None
        else mo.md("## Comparison\n\n*No results yet*")
    )
    return (comparison_df,)
@app.cell(hide_code=True)
def _(mo, apollo_results, pdl_results, fullenrich_results, json):
def _fmt(d):
return mo.md(f"```json\n{json.dumps(d, indent=2, default=str)}\n```")
mo.vstack([
mo.md("## Raw Results"),
mo.md("## Results"),
mo.ui.tabs({
"Apollo": _fmt(apollo_results),
"PDL": _fmt(pdl_results),
@@ -410,24 +324,17 @@ def _(mo, apollo_results, pdl_results, fullenrich_results, json):
@app.cell(hide_code=True)
def _(mo, comparison_df, apollo_results, pdl_results, fullenrich_results, json):
_items = []
if comparison_df is not None:
_csv_bytes = comparison_df.write_csv().encode()
_items.append(
mo.download(_csv_bytes, filename="enrichment_comparison.csv", label="Download CSV")
)
def _(mo, apollo_results, pdl_results, fullenrich_results, json):
_raw = json.dumps(
{"apollo": apollo_results, "pdl": pdl_results, "fullenrich": fullenrich_results},
indent=2,
default=str,
).encode()
_items.append(
mo.download(_raw, filename="enrichment_raw.json", label="Download Raw JSON")
)
mo.vstack([mo.md("## Export"), mo.hstack(_items)])
mo.vstack([
mo.md("## Export"),
mo.download(_raw, filename="enrichment_raw.json", label="Download Raw JSON"),
])
return