diff --git a/comparison.py b/comparison.py index ddfb9c3..761bc8c 100644 --- a/comparison.py +++ b/comparison.py @@ -3,7 +3,6 @@ # dependencies = [ # "marimo", # "httpx", -# "polars", # ] # /// @@ -25,9 +24,7 @@ def _(): import time from pathlib import Path - import polars as pl - - return Path, csv, hashlib, httpx, io, json, mo, os, pl, time + return Path, csv, hashlib, httpx, io, json, mo, os, time @app.cell(hide_code=True) @@ -138,71 +135,6 @@ def _( return (people,) -@app.cell(hide_code=True) -def _(): - def extract_apollo(data): - p = data.get("person") or {} - org = p.get("organization") or {} - loc = [p.get("city"), p.get("state"), p.get("country")] - return { - "name": p.get("name") or "", - "title": p.get("title") or "", - "company": org.get("name") or "", - "industry": org.get("industry") or "", - "location": ", ".join(x for x in loc if x), - "linkedin": p.get("linkedin_url") or "", - "phones": ", ".join(p.get("phone_numbers") or []), - "found_emails": p.get("email") or "", - } - - def extract_pdl(data): - d = data.get("data") or data - phones = d.get("mobile_phone") or "" - if not phones and d.get("phone_numbers"): - phones = ", ".join(d["phone_numbers"][:3]) - emails_parts = [] - if d.get("work_email"): - emails_parts.append(d["work_email"]) - if d.get("personal_emails"): - emails_parts.extend(d["personal_emails"][:2]) - return { - "name": d.get("full_name") or "", - "title": d.get("job_title") or "", - "company": d.get("job_company_name") or "", - "industry": d.get("job_company_industry") or "", - "location": d.get("location_name") or "", - "linkedin": d.get("linkedin_url") or "", - "phones": phones, - "found_emails": ", ".join(emails_parts), - } - - def extract_fullenrich(data): - ci = data.get("contact_info") or {} - prof = data.get("profile") or {} - inp = data.get("input") or {} - emails_parts = [] - if ci.get("most_probable_work_email"): - emails_parts.append(ci["most_probable_work_email"]) - if ci.get("work_emails"): - for e in ci["work_emails"]: - if e not in emails_parts: - emails_parts.append(e) - if ci.get("personal_email"): - emails_parts.append(ci["personal_email"]) - return { - "name": prof.get("full_name") or "", - "title": prof.get("headline") or "", - "company": inp.get("company_name") or "", - "industry": "", - "location": prof.get("location") or "", - "linkedin": inp.get("linkedin_url") or "", - "phones": ", ".join(ci.get("phones") or []), - "found_emails": ", ".join(emails_parts), - } - - return extract_apollo, extract_fullenrich, extract_pdl - - @app.cell(hide_code=True) def _(mo, people, apollo_key, httpx, read_cache, write_cache): apollo_results = {} @@ -257,10 +189,19 @@ def _(mo, people, pdl_key, httpx, read_cache, write_cache): pdl_results[_email] = _cached continue try: + _params = {"email": _email, "min_likelihood": 5} + if _person.get("first_name"): + _params["first_name"] = _person["first_name"] + if _person.get("last_name"): + _params["last_name"] = _person["last_name"] + if _person.get("linkedin_url"): + _params["profile"] = _person["linkedin_url"] + if _person.get("domain"): + _params["website"] = _person["domain"] _r = httpx.get( "https://api.peopledatalabs.com/v5/person/enrich", headers={"X-Api-Key": pdl_key.value}, - params={"email": _email, "min_likelihood": 5}, + params=_params, timeout=30, ) _data = _r.json() @@ -302,10 +243,10 @@ def _(mo, people, fullenrich_key, httpx, read_cache, write_cache, time): "custom": {"email": _person["email"]}, } _has_id = False - if _person.get("first_name") and _person.get("last_name"): + if _person.get("first_name") and _person.get("last_name") and _person.get("domain"): _entry["first_name"] = _person["first_name"] _entry["last_name"] = _person["last_name"] - _entry["domain"] = _person.get("domain") or _person["email"].split("@")[1] + _entry["domain"] = _person["domain"] _has_id = True if _person.get("linkedin_url"): _entry["linkedin_url"] = _person["linkedin_url"] @@ -366,40 +307,13 @@ def _(mo, people, fullenrich_key, httpx, read_cache, write_cache, time): return (fullenrich_results,) -@app.cell(hide_code=True) -def _( - mo, people, apollo_results, pdl_results, fullenrich_results, - extract_apollo, extract_pdl, extract_fullenrich, pl, -): - _rows = [] - for _person in people: - _email = _person["email"] - for _provider, _results, _extractor in [ - ("Apollo", apollo_results, extract_apollo), - ("PDL", pdl_results, extract_pdl), - ("FullEnrich", fullenrich_results, extract_fullenrich), - ]: - if _email in _results and "error" not in _results[_email]: - _extracted = _extractor(_results[_email]) - _rows.append({"email": _email, "provider": _provider, **_extracted}) - - comparison_df = pl.DataFrame(_rows) if _rows else None - - if comparison_df is not None: - mo.vstack([mo.md("## Comparison"), mo.ui.table(comparison_df)]) - else: - mo.md("## Comparison\n\n*No results yet*") - - return (comparison_df,) - - @app.cell(hide_code=True) def _(mo, apollo_results, pdl_results, fullenrich_results, json): def _fmt(d): return mo.md(f"```json\n{json.dumps(d, indent=2, default=str)}\n```") mo.vstack([ - mo.md("## Raw Results"), + mo.md("## Results"), mo.ui.tabs({ "Apollo": _fmt(apollo_results), "PDL": _fmt(pdl_results), @@ -410,24 +324,17 @@ def _(mo, apollo_results, pdl_results, fullenrich_results, json): @app.cell(hide_code=True) -def _(mo, comparison_df, apollo_results, pdl_results, fullenrich_results, json): - _items = [] - if comparison_df is not None: - _csv_bytes = comparison_df.write_csv().encode() - _items.append( - mo.download(_csv_bytes, filename="enrichment_comparison.csv", label="Download CSV") - ) - +def _(mo, apollo_results, pdl_results, fullenrich_results, json): _raw = json.dumps( {"apollo": apollo_results, "pdl": pdl_results, "fullenrich": fullenrich_results}, indent=2, default=str, ).encode() - _items.append( - mo.download(_raw, filename="enrichment_raw.json", label="Download Raw JSON") - ) - mo.vstack([mo.md("## Export"), mo.hstack(_items)]) + mo.vstack([ + mo.md("## Export"), + mo.download(_raw, filename="enrichment_raw.json", label="Download Raw JSON"), + ]) return