deslop

2026-02-24 02:36:30 +01:00
parent f8a93a5d42
commit 51120ec6f2
1 changed files with 19 additions and 112 deletions
--- a/comparison.py
+++ b/comparison.py
@@ -3,7 +3,6 @@
 # dependencies = [
 #     "marimo",
 #     "httpx",
 #     "polars",
 # ]
 # ///
@@ -25,9 +24,7 @@ def _():
    import time
    from pathlib import Path
-    import polars as pl
+    return Path, csv, hashlib, httpx, io, json, mo, os, time
    return Path, csv, hashlib, httpx, io, json, mo, os, pl, time
@app.cell(hide_code=True)
@@ -138,71 +135,6 @@ def _(
    return (people,)
@app.cell(hide_code=True)
 def _():
    def extract_apollo(data):
        """Flatten an Apollo response into the shared comparison row.

        Args:
            data: Decoded response dict. The person record is read from its
                ``"person"`` key; a missing or null value is treated as empty.

        Returns:
            Dict with the keys ``name``, ``title``, ``company``, ``industry``,
            ``location``, ``linkedin``, ``phones`` and ``found_emails``.
            Absent fields become empty strings.
        """
        p = data.get("person") or {}
        org = p.get("organization") or {}
        loc = [p.get("city"), p.get("state"), p.get("country")]

        # Phone entries may be plain strings or objects. Joining dicts directly
        # raises TypeError, so each entry is reduced to a string first.
        # NOTE(review): the "sanitized_number"/"raw_number" keys follow
        # Apollo's published person schema -- confirm against a live response.
        phones = []
        for entry in p.get("phone_numbers") or []:
            if isinstance(entry, dict):
                entry = entry.get("sanitized_number") or entry.get("raw_number")
            if entry is not None:
                phones.append(str(entry))

        return {
            "name": p.get("name") or "",
            "title": p.get("title") or "",
            "company": org.get("name") or "",
            "industry": org.get("industry") or "",
            # Only the location parts that are present are joined.
            "location": ", ".join(x for x in loc if x),
            "linkedin": p.get("linkedin_url") or "",
            "phones": ", ".join(phones),
            "found_emails": p.get("email") or "",
        }
    def extract_pdl(data):
        """Flatten a People Data Labs response into the shared comparison row.

        Args:
            data: Decoded response dict. The record is read from its ``"data"``
                key when that is present and truthy, otherwise from ``data``
                itself.

        Returns:
            Dict with the keys ``name``, ``title``, ``company``, ``industry``,
            ``location``, ``linkedin``, ``phones`` and ``found_emails``.
            ``phones`` is the mobile number if there is one, else up to three
            entries of ``phone_numbers``. ``found_emails`` is the work email
            followed by up to two personal emails.
        """
        d = data.get("data") or data

        # NOTE(review): PDL appears to return a boolean instead of the value
        # for fields outside the API plan -- confirm. Values are therefore
        # type-checked before being sliced or joined, so such a response gives
        # empty strings instead of a TypeError or a stray ``True``.
        mobile = d.get("mobile_phone")
        phones = mobile if isinstance(mobile, str) else ""
        if not phones:
            numbers = d.get("phone_numbers")
            if isinstance(numbers, list):
                phones = ", ".join(n for n in numbers[:3] if isinstance(n, str))

        emails_parts = []
        work_email = d.get("work_email")
        if work_email and isinstance(work_email, str):
            emails_parts.append(work_email)
        personal = d.get("personal_emails")
        if isinstance(personal, list):
            emails_parts.extend(e for e in personal[:2] if isinstance(e, str))

        return {
            "name": d.get("full_name") or "",
            "title": d.get("job_title") or "",
            "company": d.get("job_company_name") or "",
            "industry": d.get("job_company_industry") or "",
            "location": d.get("location_name") or "",
            "linkedin": d.get("linkedin_url") or "",
            "phones": phones,
            "found_emails": ", ".join(emails_parts),
        }
    def extract_fullenrich(data):
        """Flatten a FullEnrich result into the shared comparison row.

        Reads the ``contact_info``, ``profile`` and ``input`` sections of
        ``data`` (each treated as empty when missing or null). Emails are
        listed in this order: most probable work email, remaining work emails
        without repeats, then the personal email. ``industry`` is always empty.
        """
        contact = data.get("contact_info") or {}
        profile = data.get("profile") or {}
        request = data.get("input") or {}

        primary = contact.get("most_probable_work_email")
        found = [primary] if primary else []
        # Work emails keep their order; anything already collected is skipped.
        for address in contact.get("work_emails") or ():
            if address in found:
                continue
            found.append(address)
        personal = contact.get("personal_email")
        if personal:
            found.append(personal)

        row = {
            "name": profile.get("full_name") or "",
            "title": profile.get("headline") or "",
            "company": request.get("company_name") or "",
            "industry": "",
            "location": profile.get("location") or "",
            "linkedin": request.get("linkedin_url") or "",
        }
        row["phones"] = ", ".join(contact.get("phones") or [])
        row["found_emails"] = ", ".join(found)
        return row
    return extract_apollo, extract_fullenrich, extract_pdl
@app.cell(hide_code=True)
 def _(mo, people, apollo_key, httpx, read_cache, write_cache):
    apollo_results = {}
@@ -257,10 +189,19 @@ def _(mo, people, pdl_key, httpx, read_cache, write_cache):
                    pdl_results[_email] = _cached
                    continue
                try:
                    _params = {"email": _email, "min_likelihood": 5}
                    if _person.get("first_name"):
                        _params["first_name"] = _person["first_name"]
                    if _person.get("last_name"):
                        _params["last_name"] = _person["last_name"]
                    if _person.get("linkedin_url"):
                        _params["profile"] = _person["linkedin_url"]
                    if _person.get("domain"):
                        _params["website"] = _person["domain"]
                    _r = httpx.get(
                        "https://api.peopledatalabs.com/v5/person/enrich",
                        headers={"X-Api-Key": pdl_key.value},
-                        params={"email": _email, "min_likelihood": 5},
+                        params=_params,
                        timeout=30,
                    )
                    _data = _r.json()
@@ -302,10 +243,10 @@ def _(mo, people, fullenrich_key, httpx, read_cache, write_cache, time):
                    "custom": {"email": _person["email"]},
                }
                _has_id = False
-                if _person.get("first_name") and _person.get("last_name"):
+                if _person.get("first_name") and _person.get("last_name") and _person.get("domain"):
                    _entry["first_name"] = _person["first_name"]
                    _entry["last_name"] = _person["last_name"]
-                    _entry["domain"] = _person.get("domain") or _person["email"].split("@")[1]
+                    _entry["domain"] = _person["domain"]
                    _has_id = True
                if _person.get("linkedin_url"):
                    _entry["linkedin_url"] = _person["linkedin_url"]
@@ -366,40 +307,13 @@ def _(mo, people, fullenrich_key, httpx, read_cache, write_cache, time):
    return (fullenrich_results,)
@app.cell(hide_code=True)
 def _(
    mo, people, apollo_results, pdl_results, fullenrich_results,
    extract_apollo, extract_pdl, extract_fullenrich, pl,
 ):
    # Comparison cell: one table row per (person, provider) pair that has a
    # result without an "error" key. Exposes `comparison_df`, which is a
    # polars DataFrame, or None when no provider returned anything usable.
    _providers = [
        ("Apollo", apollo_results, extract_apollo),
        ("PDL", pdl_results, extract_pdl),
        ("FullEnrich", fullenrich_results, extract_fullenrich),
    ]
    _rows = []
    for _person in people:
        _email = _person["email"]
        for _provider, _results, _extractor in _providers:
            _result = _results.get(_email)
            # Missing results and error payloads are left out of the table.
            if _result is None or "error" in _result:
                continue
            _rows.append({"email": _email, "provider": _provider, **_extractor(_result)})
    comparison_df = pl.DataFrame(_rows) if _rows else None

    # marimo renders only the cell's last expression. Calls placed inside an
    # if/else are statements and are never shown, so the view is built first
    # and then left as the final expression.
    if comparison_df is not None:
        _view = mo.vstack([mo.md("## Comparison"), mo.ui.table(comparison_df)])
    else:
        _view = mo.md("## Comparison\n\n*No results yet*")
    _view
    return (comparison_df,)
@app.cell(hide_code=True)
 def _(mo, apollo_results, pdl_results, fullenrich_results, json):
    def _fmt(d):
        # Pretty-print `d` as JSON inside a fenced markdown block. Values that
        # json cannot encode natively are converted with str().
        _pretty = json.dumps(d, indent=2, default=str)
        return mo.md(f"```json\n{_pretty}\n```")
    mo.vstack([
-        mo.md("## Raw Results"),
+        mo.md("## Results"),
        mo.ui.tabs({
            "Apollo": _fmt(apollo_results),
            "PDL": _fmt(pdl_results),
@@ -410,24 +324,17 @@ def _(mo, apollo_results, pdl_results, fullenrich_results, json):
@app.cell(hide_code=True)
-def _(mo, comparison_df, apollo_results, pdl_results, fullenrich_results, json):
+def _(mo, apollo_results, pdl_results, fullenrich_results, json):
    _items = []
    if comparison_df is not None:
        _csv_bytes = comparison_df.write_csv().encode()
        _items.append(
            mo.download(_csv_bytes, filename="enrichment_comparison.csv", label="Download CSV")
        )
    _raw = json.dumps(
        {"apollo": apollo_results, "pdl": pdl_results, "fullenrich": fullenrich_results},
        indent=2,
        default=str,
    ).encode()
    _items.append(
        mo.download(_raw, filename="enrichment_raw.json", label="Download Raw JSON")
    )
-    mo.vstack([mo.md("## Export"), mo.hstack(_items)])
+    mo.vstack([
        mo.md("## Export"),
        mo.download(_raw, filename="enrichment_raw.json", label="Download Raw JSON"),
    ])
    return