from pathlib import Path
import re

HTML_PATH = Path(r'd:\kreuzberg\pipeline_claim\index.html')


def strip_view_page_links(text: str) -> str:
    """Return *text* with ``view pN`` citation links removed and spacing tidied.

    Steps, in order:

    1. Delete every ``<a class="cite-page" ...>view p<digits></a>`` element
       together with any whitespace immediately before it.
    2. Delete whitespace directly before ``)`` and directly after ``(``.
    3. Collapse every run of two or more whitespace characters into a
       single space.
    4. Delete the space in front of a semicolon (``' ;'`` -> ``';'``).

    Args:
        text: The HTML document as a string.

    Returns:
        The cleaned document.
    """
    # Take the leading whitespace with the link so no gap is left behind.
    text = re.sub(r'\s*<a class="cite-page"[^>]*>view p\d+</a>', '', text)

    # Tighten parentheses that were left padded once a link disappeared.
    text = re.sub(r'\s+\)', ')', text)
    text = re.sub(r'\(\s+', '(', text)

    # NOTE: this applies to the whole document, not only to text near the
    # removed links: newlines and indentation are whitespace runs too, so
    # multi-line markup is flattened onto single-space-separated text.
    text = re.sub(r'\s{2,}', ' ', text)

    # An earlier version chained .replace('( ;', '(') after this call. It
    # could never match: whitespace after '(' is already gone, and this
    # replace turns ' ;' into ';' first. It was dropped as dead code; a
    # leading '(;' is therefore left as-is, exactly as before.
    return text.replace(' ;', ';')


def main() -> None:
    """Rewrite ``HTML_PATH`` in place without its ``view pN`` links."""
    # errors='replace' keeps the script running on undecodable bytes, but
    # they become U+FFFD and are written back that way.
    original = HTML_PATH.read_text(encoding='utf-8', errors='replace')
    HTML_PATH.write_text(strip_view_page_links(original), encoding='utf-8')
    print('removed view page links')


if __name__ == '__main__':
    main()
