# pip install anthropic pdfplumber python-docx


import anthropic
import pdfplumber
import csv
import json
import os
from docx import Document
from datetime import datetime


import anthropic
import pdfplumber
import csv
import json
import re
import os
from docx import Document
from datetime import datetime


# Codebook text that code_document() embeds verbatim in every coding prompt.
# It defines what counts as a step, the three step types (Locate / Act /
# Verify), the flags that are recorded separately from steps, and the coding
# rules. Editing this string changes the instructions sent to the model.
CODEBOOK = """
A step is any discrete action a member must take to exercise a single privacy right.

Step types:
- Locate: finding information needed to act
- Act: taking an action
- Verify: providing identity confirmation

What counts as a step:
1. Locating contact information [Locate]
2. Initiating contact [Act]
3. Verifying identity [Verify]
4. Stating the request verbally or in writing [Act]
5. Completing a form [Act]
6. Mailing or submitting a document [Act]
7. Navigating to a separate system [Act]

Flags (recorded separately):
- Waiting period
- Insurer may decline the request
- Prior disclosures cannot be undone
- Opt-out not available for this data type
- Do Not Track signals ignored
- Default opt-in
- No digital opt-out exists

Coding rules:
- Code the most burdensome pathway described
- Do not infer steps not described in the document
- If multiple opt-out mechanisms exist, code each separately
"""

# Corpus to code: one dict per policy document with the keys
# insurer / state / doc_type / path. Every document is coded for Georgia,
# so only the three varying fields are listed per entry.
documents = [
    {"insurer": insurer, "state": "Georgia", "doc_type": doc_type, "path": path}
    for insurer, doc_type, path in (
        ("Aetna", "Web Privacy Policy",
         r"C:\Users\victo\OneDrive\Desktop\Privacy Policies\Aetna privacy.docx"),
        ("Anthem BCBS", "HIPAA Notice",
         r"C:\Users\victo\OneDrive\Desktop\Privacy Policies\anthem BCBS privacy practices.pdf"),
        ("Anthem BCBS", "HIPAA Notice (Spanish)",
         r"C:\Users\victo\OneDrive\Desktop\Privacy Policies\anthem BCBS privacy spanish.pdf"),
        ("Cigna", "Data Sharing Notice",
         r"C:\Users\victo\OneDrive\Desktop\Privacy Policies\Cigna privacy data sharing.docx"),
        ("Cigna", "Global Health Benefits Notice",
         r"C:\Users\victo\OneDrive\Desktop\Privacy Policies\cigna-global-health-benefits-privacy-notice-eng_copy.pdf"),
        ("Cigna", "HIPAA Notice",
         r"C:\Users\victo\OneDrive\Desktop\Privacy Policies\cigna-health-care-and-cigna-supplemental-benefits-privacy-notice-eng_copy.pdf"),
        ("Cigna", "GLB Notice",
         r"C:\Users\victo\OneDrive\Desktop\Privacy Policies\gramm-leach-bliley-act-privacy-notice_copy.pdf"),
        ("Humana", "HIPAA Notice",
         r"C:\Users\victo\OneDrive\Desktop\Privacy Policies\humana privacy practices.pdf"),
        ("UnitedHealthcare", "Web Privacy Policy",
         r"C:\Users\victo\OneDrive\Desktop\Privacy Policies\UHC privacy.docx"),
        ("UnitedHealthcare", "HIPAA Notice",
         r"C:\Users\victo\OneDrive\Desktop\Privacy Policies\united hipaa privacy.pdf"),
    )
]

def extract_text(path):
    """Return the text of a PDF or Word document as one space-joined string.

    Args:
        path: Location of the file. The extension is matched
            case-insensitively: ``.pdf`` is read with pdfplumber and
            ``.docx`` with python-docx.

    Returns:
        The non-empty page texts (PDF) or paragraph texts (DOCX) joined with
        single spaces. Any other extension yields an empty string.
    """
    lowered = str(path).lower()
    if lowered.endswith(".pdf"):
        with pdfplumber.open(path) as pdf:
            # Run the extraction once per page and reuse the result; the
            # previous version called extract_text() twice for every page.
            page_texts = [page.extract_text() for page in pdf.pages]
        return " ".join(chunk for chunk in page_texts if chunk)
    if lowered.endswith(".docx"):
        doc = Document(path)
        return " ".join(para.text for para in doc.paragraphs if para.text)
    return ""

def parse_response(raw):
    """Extract and parse the first JSON object found in a model reply.

    Markdown code-fence markers are removed and non-ASCII characters are
    dropped before parsing. Decoding starts at the first ``{`` and stops at
    the end of that object, so commentary after the JSON (even commentary
    that itself contains braces) no longer breaks parsing.

    Args:
        raw: The reply text.

    Returns:
        The decoded JSON object as a dict.

    Raises:
        ValueError: If the text contains no ``{`` at all.
        json.JSONDecodeError: If the text starting at the first ``{`` is not
            a valid JSON object (a subclass of ValueError).
    """
    clean = raw.replace("```json", "").replace("```", "").strip()
    clean = clean.encode("ascii", "ignore").decode("ascii")

    start = clean.find("{")
    if start == -1:
        raise ValueError("No JSON object found in response")

    # Only the first "{" is tried on purpose: if the top-level object is
    # truncated or malformed this must fail (so the caller can attempt a
    # repair) rather than fall through to some nested object.
    parsed, _end = json.JSONDecoder().raw_decode(clean, start)
    return parsed

def repair_json(broken_text, client):
    """Ask the model to fix malformed JSON and return the parsed result.

    Only the first 4000 characters of ``broken_text`` are sent. The reply is
    handed to ``parse_response``, so any error it raises propagates to the
    caller.

    Args:
        broken_text: Text that failed to parse as JSON.
        client: Object exposing ``messages.create(...)``; used to send the
            repair request.

    Returns:
        Whatever ``parse_response`` returns for the repaired reply.
    """
    print("  Attempting JSON repair...")

    snippet = broken_text[:4000]
    repair_prompt = (
        "The following text is supposed to be a JSON object but has syntax errors.\n"
        "Fix it and return valid JSON only. No explanation, no markdown, just the JSON object.\n"
        "\n"
        f"{snippet}"
    )

    user_message = {"role": "user", "content": repair_prompt}
    reply = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1000,
        messages=[user_message],
    )
    fixed_text = reply.content[0].text.strip()
    return parse_response(fixed_text)

def code_document(insurer, state, doc_type, text, client, text_limit=8000, max_tokens=1000):
    """Code one policy document against CODEBOOK and return the parsed JSON.

    The prompt contains the codebook, the document metadata, the first
    ``text_limit`` characters of ``text``, and a JSON template the model is
    asked to fill in. If the reply cannot be parsed, the error and the first
    500 characters of the reply are printed and one repair attempt is made
    through ``repair_json``.

    Args:
        insurer: Insurer name, interpolated into the prompt.
        state: State name, interpolated into the prompt.
        doc_type: Document type label, interpolated into the prompt.
        text: Full document text; only ``text[:text_limit]`` is sent.
        client: Object exposing ``messages.create(...)``.
        text_limit: Maximum number of characters of ``text`` to include.
        max_tokens: Token budget passed to ``messages.create``.

    Returns:
        The result of ``parse_response`` on the reply, or the result of
        ``repair_json`` when the first parse fails.

    Raises:
        Exception: Whatever ``repair_json`` raises when the repair attempt
            also fails (re-raised after a message is printed).
    """
    prompt = f"""You are a systematic research coder applying a privacy policy codebook.

CODEBOOK:
{CODEBOOK}

DOCUMENT:
Insurer: {insurer}
State: {state}
Document type: {doc_type}
Text: {text[:text_limit]}

Identify ALL opt-out pathways described. Return ONLY a valid JSON object.

STRICT JSON RULES:
- ASCII characters only
- No newlines inside string values
- No double quotes inside string values, use single quotes instead
- Keep all string values under 60 characters
- flags_list items must be under 40 characters each
- ambiguous_notes under 80 characters

{{
  "insurer": "{insurer}",
  "state": "{state}",
  "doc_type": "{doc_type}",
  "pathways": [
    {{
      "opt_out_pathway": "pathway name",
      "total_steps": 0,
      "locate_steps": 0,
      "act_steps": 0,
      "verify_steps": 0,
      "steps_detail": [
        {{
          "step_number": 1,
          "type": "Locate or Act or Verify",
          "description": "what member must do"
        }}
      ],
      "total_flags": 0,
      "flags_list": ["flag name"],
      "opt_in_steps": "not specified",
      "asymmetry_exists": false,
      "ambiguous_notes": "brief note"
    }}
  ]
}}"""

    user_message = {"role": "user", "content": prompt}
    reply = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=max_tokens,
        messages=[user_message],
    )
    raw_text = reply.content[0].text.strip()

    try:
        parsed = parse_response(raw_text)
    except (json.JSONDecodeError, ValueError) as parse_error:
        # Show what went wrong and what the model actually sent before the
        # single repair attempt.
        print("\n=== JSON ERROR ===")
        print(parse_error)
        print("\n=== RAW OUTPUT (first 500 chars) ===")
        print(raw_text[:500])
        print("=================\n")
        try:
            return repair_json(raw_text, client)
        except Exception as repair_error:
            print(f"  Repair also failed: {repair_error}")
            raise
    else:
        return parsed

# --- Code every document in `documents` -----------------------------------
# Nothing in this file creates `client`. Reuse one that already exists in the
# namespace (for example from an earlier notebook cell); otherwise build one.
# anthropic.Anthropic() with no arguments takes the API key from the
# ANTHROPIC_API_KEY environment variable.
if "client" not in globals():
    client = anthropic.Anthropic()

run_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
all_rows = []      # one flat dict per coded pathway (written to the CSV)
raw_outputs = []   # one parsed model result per document (written to the JSON)

for doc in documents:
    print(f"Coding: {doc['insurer']} - {doc['doc_type']}")
    try:
        text = extract_text(doc["path"])
        if not text.strip():
            # Unsupported extension or a PDF with no extractable text:
            # there is nothing to code, so do not send an empty prompt.
            raise ValueError(f"no text extracted from {doc['path']}")

        # Use reduced text limit and higher token budget for complex documents
        if doc["doc_type"] == "Data Sharing Notice":
            text_limit = 4000
            max_tokens = 1500
        else:
            text_limit = 8000
            max_tokens = 1000

        result = code_document(
            doc["insurer"], doc["state"], doc["doc_type"],
            text, client, text_limit, max_tokens
        )
        raw_outputs.append(result)

        # Build this document's rows in a local list first, so a malformed
        # pathway (missing key, wrong type) cannot leave the document
        # half-recorded in all_rows.
        doc_rows = [
            {
                "run_timestamp": run_timestamp,
                "insurer": result["insurer"],
                "state": result["state"],
                "doc_type": result["doc_type"],
                "opt_out_pathway": pathway["opt_out_pathway"],
                "total_steps": pathway["total_steps"],
                "locate_steps": pathway["locate_steps"],
                "act_steps": pathway["act_steps"],
                "verify_steps": pathway["verify_steps"],
                "total_flags": pathway["total_flags"],
                "flags_list": "; ".join(pathway["flags_list"]),
                "opt_in_steps": pathway["opt_in_steps"],
                "asymmetry_exists": pathway["asymmetry_exists"],
                "ambiguous_notes": pathway.get("ambiguous_notes", ""),
                "coder": "claude-sonnet-4-6",
                "date_coded": run_timestamp,
            }
            for pathway in result["pathways"]
        ]
    except Exception as e:
        # Per-document boundary: report the failure and move on so one bad
        # document does not abort the whole run.
        print(f"  Failed: {e}")
    else:
        all_rows.extend(doc_rows)
        print(f"  Found {len(doc_rows)} pathway(s)")

# --- Save results ----------------------------------------------------------
# The CSV holds one row per coded pathway; the JSON holds the parsed model
# output for each document. Nothing is written when no pathway was coded.
csv_path = r"C:\Users\victo\OneDrive\Desktop\Privacy Policies\opt_out_step_counts.csv"
json_path = r"C:\Users\victo\OneDrive\Desktop\Privacy Policies\opt_out_raw_outputs.json"

if all_rows:
    fieldnames = [
        "run_timestamp", "insurer", "state", "doc_type", "opt_out_pathway",
        "total_steps", "locate_steps", "act_steps", "verify_steps",
        "total_flags", "flags_list", "opt_in_steps", "asymmetry_exists",
        "ambiguous_notes", "coder", "date_coded"
    ]
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_rows)

    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(raw_outputs, f, indent=2)

    # Count the documents that actually returned a parsed result instead of
    # len(documents), which overstated coverage whenever a document failed.
    coded_docs = len(raw_outputs)
    print(f"\nDone. {len(all_rows)} pathways coded across {coded_docs} documents.")
    missing_docs = len(documents) - coded_docs
    if missing_docs > 0:
        print(f"Warning: {missing_docs} of {len(documents)} documents failed and are not in the output.")
    print(f"CSV: {csv_path}")
    print(f"JSON: {json_path}")
else:
    print("No rows to save.")

Coding: Aetna - Web Privacy Policy
  Found 1 pathway(s)
Coding: Anthem BCBS - HIPAA Notice
  Found 3 pathway(s)
Coding: Anthem BCBS - HIPAA Notice (Spanish)
  Found 3 pathway(s)
Coding: Cigna - Data Sharing Notice
  Found 2 pathway(s)
Coding: Cigna - Global Health Benefits Notice
  Found 1 pathway(s)
Coding: Cigna - HIPAA Notice
  Found 1 pathway(s)
Coding: Cigna - GLB Notice
  Found 1 pathway(s)
Coding: Humana - HIPAA Notice
  Found 1 pathway(s)
Coding: UnitedHealthcare - Web Privacy Policy
  Found 1 pathway(s)
Coding: UnitedHealthcare - HIPAA Notice
  Found 3 pathway(s)

Done. 17 pathways coded across 10 documents.
CSV: C:\Users\victo\OneDrive\Desktop\Privacy Policies\opt_out_step_counts.csv
JSON: C:\Users\victo\OneDrive\Desktop\Privacy Policies\opt_out_raw_outputs.json

Opt-Out Step Count Codebook

Methodological grounding

Definition

Step types

What counts as a step

Asymmetry measure

Flags (recorded separately, not as steps)

Coding rules

Scoring

Output format

Scope and limitations

Generative AI Statement

References

Opt-Out Step Count Codebook

Methodological grounding

Definition

Step types

What counts as a step

Asymmetry measure

Flags (recorded separately, not as steps)

Coding rules

Scoring

Output format

Scope and limitations

Preliminary Findings: Consent Friction in Health Insurance Privacy Notices

Generative AI Statement

References