from pathlib import Path
import re

path = Path(r'd:\kreuzberg\pipeline_claim\index.html')


def strip_page_ranges(text: str) -> str:
    """Return *text* with page ranges removed from ``DOCnn`` citations.

    A citation is ``DOC`` followed by two digits; a trailing ``p1`` or
    ``p1-p7`` page range (with optional whitespace before it) is dropped.
    Afterwards, stray inline spacing and punctuation are tidied.
    """
    # Remove page ranges like "p1" or "p1-p7" from citations in Issues Presented
    text = re.sub(r'(DOC\d{2})\s*p\d+(?:-p\d+)?', r'\1', text)

    # Collapse runs of spaces/tabs only when they sit between two visible
    # characters on the same line. Newlines and leading indentation are left
    # alone so the document keeps its line structure (flattening lines can
    # change the meaning of whitespace-sensitive content such as inline
    # scripts containing "//" comments).
    text = re.sub(r'(?<=\S)[ \t]{2,}(?=\S)', ' ', text)

    # '( ;' must be handled before ' ;': once ' ;' has been rewritten to ';'
    # the longer pattern can no longer match.
    return text.replace('( ;', '(').replace(' ;', ';')


def main() -> None:
    """Rewrite the HTML file at ``path`` in place with page ranges removed."""
    # surrogateescape keeps undecodable bytes intact across the read/write
    # round trip instead of permanently replacing them with U+FFFD.
    text = path.read_text(encoding='utf-8', errors='surrogateescape')
    path.write_text(
        strip_page_ranges(text),
        encoding='utf-8',
        errors='surrogateescape',
    )
    print('removed page ranges from citations')


if __name__ == '__main__':
    main()
