deslop

2026-02-24 02:36:30 +01:00
parent f8a93a5d42
commit 51120ec6f2
1 changed file with 19 additions and 112 deletions
--- a/comparison.py
+++ b/comparison.py
@@ -3,7 +3,6 @@
 # dependencies = [
 #     "marimo",
 #     "httpx",
-#     "polars",
 # ]
 # ///

@@ -25,9 +24,7 @@ def _():
    import time
    from pathlib import Path

-    import polars as pl
-
-    return Path, csv, hashlib, httpx, io, json, mo, os, pl, time
+    return Path, csv, hashlib, httpx, io, json, mo, os, time


@app.cell(hide_code=True)
@@ -138,71 +135,6 @@ def _(
    return (people,)


-@app.cell(hide_code=True)
-def _():
-    def extract_apollo(data):
-        p = data.get("person") or {}
-        org = p.get("organization") or {}
-        loc = [p.get("city"), p.get("state"), p.get("country")]
-        return {
-            "name": p.get("name") or "",
-            "title": p.get("title") or "",
-            "company": org.get("name") or "",
-            "industry": org.get("industry") or "",
-            "location": ", ".join(x for x in loc if x),
-            "linkedin": p.get("linkedin_url") or "",
-            "phones": ", ".join(p.get("phone_numbers") or []),
-            "found_emails": p.get("email") or "",
-        }
-
-    def extract_pdl(data):
-        d = data.get("data") or data
-        phones = d.get("mobile_phone") or ""
-        if not phones and d.get("phone_numbers"):
-            phones = ", ".join(d["phone_numbers"][:3])
-        emails_parts = []
-        if d.get("work_email"):
-            emails_parts.append(d["work_email"])
-        if d.get("personal_emails"):
-            emails_parts.extend(d["personal_emails"][:2])
-        return {
-            "name": d.get("full_name") or "",
-            "title": d.get("job_title") or "",
-            "company": d.get("job_company_name") or "",
-            "industry": d.get("job_company_industry") or "",
-            "location": d.get("location_name") or "",
-            "linkedin": d.get("linkedin_url") or "",
-            "phones": phones,
-            "found_emails": ", ".join(emails_parts),
-        }
-
-    def extract_fullenrich(data):
-        ci = data.get("contact_info") or {}
-        prof = data.get("profile") or {}
-        inp = data.get("input") or {}
-        emails_parts = []
-        if ci.get("most_probable_work_email"):
-            emails_parts.append(ci["most_probable_work_email"])
-        if ci.get("work_emails"):
-            for e in ci["work_emails"]:
-                if e not in emails_parts:
-                    emails_parts.append(e)
-        if ci.get("personal_email"):
-            emails_parts.append(ci["personal_email"])
-        return {
-            "name": prof.get("full_name") or "",
-            "title": prof.get("headline") or "",
-            "company": inp.get("company_name") or "",
-            "industry": "",
-            "location": prof.get("location") or "",
-            "linkedin": inp.get("linkedin_url") or "",
-            "phones": ", ".join(ci.get("phones") or []),
-            "found_emails": ", ".join(emails_parts),
-        }
-
-    return extract_apollo, extract_fullenrich, extract_pdl
-
-
@app.cell(hide_code=True)
 def _(mo, people, apollo_key, httpx, read_cache, write_cache):
    apollo_results = {}
@@ -257,10 +189,19 @@ def _(mo, people, pdl_key, httpx, read_cache, write_cache):
                    pdl_results[_email] = _cached
                    continue
                try:
+                    _params = {"email": _email, "min_likelihood": 5}
+                    if _person.get("first_name"):
+                        _params["first_name"] = _person["first_name"]
+                    if _person.get("last_name"):
+                        _params["last_name"] = _person["last_name"]
+                    if _person.get("linkedin_url"):
+                        _params["profile"] = _person["linkedin_url"]
+                    if _person.get("domain"):
+                        _params["website"] = _person["domain"]
                    _r = httpx.get(
                        "https://api.peopledatalabs.com/v5/person/enrich",
                        headers={"X-Api-Key": pdl_key.value},
-                        params={"email": _email, "min_likelihood": 5},
+                        params=_params,
                        timeout=30,
                    )
                    _data = _r.json()
@@ -302,10 +243,10 @@ def _(mo, people, fullenrich_key, httpx, read_cache, write_cache, time):
                    "custom": {"email": _person["email"]},
                }
                _has_id = False
-                if _person.get("first_name") and _person.get("last_name"):
+                if _person.get("first_name") and _person.get("last_name") and _person.get("domain"):
                    _entry["first_name"] = _person["first_name"]
                    _entry["last_name"] = _person["last_name"]
-                    _entry["domain"] = _person.get("domain") or _person["email"].split("@")[1]
+                    _entry["domain"] = _person["domain"]
                    _has_id = True
                if _person.get("linkedin_url"):
                    _entry["linkedin_url"] = _person["linkedin_url"]
@@ -366,40 +307,13 @@ def _(mo, people, fullenrich_key, httpx, read_cache, write_cache, time):
    return (fullenrich_results,)


-@app.cell(hide_code=True)
-def _(
-    mo, people, apollo_results, pdl_results, fullenrich_results,
-    extract_apollo, extract_pdl, extract_fullenrich, pl,
-):
-    _rows = []
-    for _person in people:
-        _email = _person["email"]
-        for _provider, _results, _extractor in [
-            ("Apollo", apollo_results, extract_apollo),
-            ("PDL", pdl_results, extract_pdl),
-            ("FullEnrich", fullenrich_results, extract_fullenrich),
-        ]:
-            if _email in _results and "error" not in _results[_email]:
-                _extracted = _extractor(_results[_email])
-                _rows.append({"email": _email, "provider": _provider, **_extracted})
-
-    comparison_df = pl.DataFrame(_rows) if _rows else None
-
-    if comparison_df is not None:
-        mo.vstack([mo.md("## Comparison"), mo.ui.table(comparison_df)])
-    else:
-        mo.md("## Comparison\n\n*No results yet*")
-
-    return (comparison_df,)
-
-
@app.cell(hide_code=True)
 def _(mo, apollo_results, pdl_results, fullenrich_results, json):
    def _fmt(d):
        return mo.md(f"```json\n{json.dumps(d, indent=2, default=str)}\n```")

    mo.vstack([
-        mo.md("## Raw Results"),
+        mo.md("## Results"),
        mo.ui.tabs({
            "Apollo": _fmt(apollo_results),
            "PDL": _fmt(pdl_results),
@@ -410,24 +324,17 @@ def _(mo, apollo_results, pdl_results, fullenrich_results, json):


@app.cell(hide_code=True)
-def _(mo, comparison_df, apollo_results, pdl_results, fullenrich_results, json):
-    _items = []
-    if comparison_df is not None:
-        _csv_bytes = comparison_df.write_csv().encode()
-        _items.append(
-            mo.download(_csv_bytes, filename="enrichment_comparison.csv", label="Download CSV")
-        )
-
+def _(mo, apollo_results, pdl_results, fullenrich_results, json):
    _raw = json.dumps(
        {"apollo": apollo_results, "pdl": pdl_results, "fullenrich": fullenrich_results},
        indent=2,
        default=str,
    ).encode()
-    _items.append(
-        mo.download(_raw, filename="enrichment_raw.json", label="Download Raw JSON")
-    )

-    mo.vstack([mo.md("## Export"), mo.hstack(_items)])
+    mo.vstack([
+        mo.md("## Export"),
+        mo.download(_raw, filename="enrichment_raw.json", label="Download Raw JSON"),
+    ])
    return