| | |
| | """ |
| | UltraData Math Parser - Hugging Face Space Demo |
| | A unified HTML parser optimized for extracting mathematical content. |
| | """ |
| |
|
| | import gradio as gr |
| | import requests |
| | from ultradata_math_parser import GeneralParser |
| |
|
| |
|
def fetch_url_content(url: str) -> tuple:
    """Fetch the HTML body of a web page.

    Args:
        url: Address to fetch. Surrounding whitespace is stripped and
            ``https://`` is prepended when no http(s) scheme is present
            (the scheme check is case-insensitive).

    Returns:
        A ``(html, info)`` pair. On success ``html`` is the decoded response
        body and ``info`` is the normalized URL that was requested. On any
        failure ``html`` is ``""`` and ``info`` is a human-readable error
        message, so callers can treat an empty ``html`` as "``info`` holds
        the error".
    """
    if not url or not url.strip():
        return "", "Please enter a URL"

    url = url.strip()
    # URL schemes are case-insensitive; do not double-prefix "HTTP://...".
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url

    # NOTE(review): this fetches arbitrary user-supplied URLs from the host
    # running the app; consider restricting private/internal addresses if the
    # deployment environment makes that a concern.
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        }
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        # When the server sends a text/* Content-Type without a charset,
        # Requests falls back to ISO-8859-1, which garbles UTF-8 pages
        # (math symbols in particular). Use the detected encoding instead.
        content_type = response.headers.get("Content-Type", "")
        if "charset" not in content_type.lower():
            response.encoding = response.apparent_encoding

        html = response.text
        if not html:
            # Keep the contract: an empty body is reported as an error
            # message rather than returning the URL in the message slot.
            return "", f"No content returned from {url}"
        return html, url
    except requests.exceptions.Timeout:
        return "", f"Request timed out for {url}"
    except requests.exceptions.RequestException as e:
        return "", f"Failed to fetch URL: {e}"
| |
|
| |
|
def fetch_and_parse(url: str, process_math: bool, include_tables: bool, enable_forum: bool, html_type: str) -> tuple:
    """Fetch a URL and parse the returned HTML in one step.

    Args:
        url: Address to fetch (normalized by ``fetch_url_content``).
        process_math: Forwarded to ``parse_html``.
        include_tables: Forwarded to ``parse_html``.
        enable_forum: Forwarded to ``parse_html`` as ``enable_forum_assembly``.
        html_type: Forwarded to ``parse_html``.

    Returns:
        A 6-tuple ``(raw_html, base_url, title, extracted_html, text,
        markdown)``. When the fetch fails, ``raw_html`` and ``base_url`` are
        empty strings and the error is reported in the title and markdown
        slots.
    """
    html_content, base_url = fetch_url_content(url)

    if not html_content:
        # On failure the second element from fetch_url_content is an error
        # message, not a URL. Do not echo it into the base-URL slot, where a
        # later parse would pick it up as if it were a real base URL.
        error_msg = base_url
        # NOTE(review): the leading "β" looks like a mis-decoded symbol
        # (probably an error icon) — confirm against the original file.
        return "", "", f"β {error_msg}", "", "", f"**Error:** {error_msg}"

    result = parse_html(
        html_content=html_content,
        base_url=base_url,
        process_math=process_math,
        include_tables=include_tables,
        enable_forum_assembly=enable_forum,
        html_type=html_type,
    )

    # format_output yields (title, extracted_html, text, markdown).
    title, extracted_html, text, markdown = format_output(result)

    return html_content, base_url, title, extracted_html, text, markdown
| |
|
| |
|
def parse_html(
    html_content: str,
    base_url: str = "",
    process_math: bool = True,
    include_tables: bool = True,
    enable_forum_assembly: bool = True,
    html_type: str = "unified",
) -> dict:
    """
    Run GeneralParser over an HTML string and normalize the outcome.

    Args:
        html_content: Raw HTML string to parse
        base_url: Base URL passed through to the parser
        process_math: Passed through to the parser's ``process_math`` option
        include_tables: Passed through to the parser's ``include_tables`` option
        enable_forum_assembly: Passed through to the parser's
            ``enable_forum_assembly`` option
        html_type: Parser type (unified/article/forum)

    Returns:
        Dictionary with the keys ``title``, ``html``, ``text``,
        ``text_length``, ``xp_num``, ``fallback_strategy``,
        ``forum_assembled`` and ``error``. ``error`` is ``None`` on success;
        otherwise it holds a message and the other fields carry their
        empty defaults.
    """
    # Field defaults, used both for the failure results and as fallbacks
    # when the parser result lacks a key.
    blank = {
        "title": "",
        "html": "",
        "text": "",
        "text_length": 0,
        "xp_num": "",
        "fallback_strategy": "",
        "forum_assembled": False,
    }

    if not (html_content and html_content.strip()):
        return {**blank, "error": "Please provide HTML content to parse."}

    parser = GeneralParser()

    try:
        extracted = parser.extract(
            html=html_content,
            base_url=base_url,
            process_math=process_math,
            include_tables=include_tables,
            enable_forum_assembly=enable_forum_assembly,
            html_type=html_type,
        )

        normalized = {field: extracted.get(field, fallback) for field, fallback in blank.items()}
        normalized["error"] = None
        return normalized
    except Exception as exc:
        # Any parser failure is reported through the "error" field so the
        # UI can display it instead of crashing.
        return {**blank, "error": str(exc)}
| |
|
| |
|
def format_output(result: dict) -> tuple:
    """Turn a ``parse_html`` result dict into the 4-tuple shown in the UI.

    Returns ``(title, html, text, markdown)``. When ``result["error"]`` is
    truthy, the title and markdown slots carry the error message and the
    html/text slots are empty. Otherwise the markdown slot mirrors the text,
    falling back to a placeholder when no text was extracted.
    """
    failure = result.get("error")
    if failure:
        return (f"β Error: {failure}", "", "", f"**Error:** {failure}")

    body = result.get("text", "")
    preview = body or "_No content extracted_"

    return result.get("title", ""), result.get("html", ""), body, preview
| |
|
| |
|
def process_input(html_content, base_url, process_math, include_tables, enable_forum, html_type):
    """Parse the supplied HTML and return the tuple produced by ``format_output``.

    The positional arguments are forwarded to ``parse_html``;
    ``enable_forum`` is passed as its ``enable_forum_assembly`` option.
    """
    parser_options = {
        "process_math": process_math,
        "include_tables": include_tables,
        "enable_forum_assembly": enable_forum,
        "html_type": html_type,
    }
    parsed = parse_html(html_content=html_content, base_url=base_url, **parser_options)
    return format_output(parsed)
| |
|
| |
|
| | |
# Sample page used as the initial value of the HTML input box: an article
# with a MathML rendering of the quadratic formula plus footer boilerplate.
# The superscript-two, plus-minus and square-root characters were
# mis-decoded in this literal and are restored here.
EXAMPLE_HTML = """<!DOCTYPE html>
<html>
<head>
<title>Quadratic Formula Example</title>
</head>
<body>
<article class="post-content">
<h1>Understanding the Quadratic Formula</h1>
<p>The quadratic formula is used to solve equations of the form ax² + bx + c = 0.</p>
<p>The solution is given by:</p>
<math xmlns="http://www.w3.org/1998/Math/MathML">
<mi>x</mi>
<mo>=</mo>
<mfrac>
<mrow>
<mo>-</mo>
<mi>b</mi>
<mo>±</mo>
<msqrt>
<mrow>
<msup><mi>b</mi><mn>2</mn></msup>
<mo>-</mo>
<mn>4</mn>
<mi>a</mi>
<mi>c</mi>
</mrow>
</msqrt>
</mrow>
<mrow>
<mn>2</mn>
<mi>a</mi>
</mrow>
</mfrac>
</math>
<p>Where a, b, and c are coefficients of the quadratic equation.</p>
<h2>Example Problem</h2>
<p>Solve: x² - 5x + 6 = 0</p>
<p>Here, a = 1, b = -5, c = 6</p>
<p>Using the formula: x = (5 ± √(25-24))/2 = (5 ± 1)/2</p>
<p>Therefore, x = 3 or x = 2</p>
</article>
<footer>
<nav>Related articles...</nav>
</footer>
</body>
</html>"""
| |
|
| |
|
| | |
# Stylesheet handed to gr.Blocks(css=...). The class names used here
# (main-title, subtitle, glass-panel, section-header, output-textbox,
# markdown-box, footer-text) are the ones attached to components in the
# layout via elem_classes or inline HTML.
# The Inter import includes weight 800 because .main-title requests it.
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700;800&family=JetBrains+Mono:wght@400;500&display=swap');

:root {
    --bg: #f3f5fb;
    --card: #ffffff;
    --border: #e5e9f2;
    --text: #1f2a44;
    --muted: #6b7280;
    --accent: #5b6ff4;
    --accent-2: #7a8af7;
    --input: #f7f9fc;
}

body {
    background-color: var(--bg) !important;
    color: var(--text) !important;
}

.gradio-container {
    font-family: 'Inter', sans-serif !important;
    background: var(--bg) !important;
    width: 100% !important;
    max-width: none !important;
    margin: 0 auto !important;
    padding: clamp(12px, 2vw, 28px) clamp(10px, 3vw, 32px) clamp(20px, 3vw, 48px) !important;
}

.main-title {
    text-align: center !important;
    font-weight: 800 !important;
    font-size: 2.4rem !important;
    color: #3b3fb6 !important;
    margin-bottom: 0.25rem !important;
}

.subtitle {
    text-align: center !important;
    color: var(--muted) !important;
    font-size: 1rem !important;
    margin-bottom: 1.5rem !important;
}

.glass-panel {
    background: var(--card) !important;
    border: 1px solid var(--border) !important;
    border-radius: 16px !important;
    padding: 20px !important;
    box-shadow: 0 10px 24px rgba(31, 42, 68, 0.08) !important;
}

.section-header {
    color: var(--text) !important;
    font-weight: 700 !important;
    font-size: 1.05rem !important;
    margin-bottom: 1rem !important;
    padding-bottom: 0.5rem !important;
    border-bottom: 1px solid var(--border) !important;
}

.block > label > span,
.form > label > span,
.gr-form > label > span,
.label-wrap > span,
span.description,
.description {
    color: var(--text) !important;
    text-shadow: none !important;
}

fieldset legend, fieldset legend span,
.gr-radio > label, .gr-radio > label span {
    color: var(--text) !important;
    font-weight: 600 !important;
}

.gr-input, textarea, input, .gr-box, .gr-check-radio, .gr-dropdown {
    font-family: 'JetBrains Mono', monospace !important;
    background-color: var(--input) !important;
    border: 1px solid var(--border) !important;
    color: var(--text) !important;
    box-shadow: none !important;
}

.gr-input:focus, textarea:focus, input:focus {
    border-color: var(--accent) !important;
    background-color: #ffffff !important;
}

.tabs {
    border: none !important;
    margin-bottom: 1rem !important;
}

.tab-nav {
    border-bottom: 1px solid var(--border) !important;
    justify-content: center !important;
    gap: 8px !important;
}

.tab-nav button {
    font-weight: 600 !important;
    font-size: 0.95rem !important;
    color: var(--muted) !important;
    background: transparent !important;
    border-radius: 8px !important;
    padding: 6px 12px !important;
}

.tab-nav button.selected {
    color: #ffffff !important;
    background: var(--accent) !important;
    border-bottom: none !important;
}

.gr-button-primary {
    background: linear-gradient(135deg, var(--accent) 0%, var(--accent-2) 100%) !important;
    border: none !important;
    color: #ffffff !important;
    font-weight: 600 !important;
    border-radius: 8px !important;
    box-shadow: 0 6px 14px rgba(91, 111, 244, 0.28) !important;
}

.gr-button-primary:hover {
    filter: brightness(0.98) !important;
}

.gr-button-secondary {
    background: #f1f4fb !important;
    border: 1px solid var(--border) !important;
    color: var(--text) !important;
    border-radius: 8px !important;
}

.output-textbox, .markdown-box {
    background-color: #fbfcff !important;
    border-radius: 12px !important;
}

.output-textbox textarea {
    background-color: transparent !important;
    border: none !important;
    box-shadow: none !important;
    color: var(--text) !important;
}

.markdown-box .prose {
    color: var(--text) !important;
    border: none !important;
    box-shadow: none !important;
    background: transparent !important;
}

.markdown-box .prose,
.markdown-box .prose * ,
.markdown-box .md,
.markdown-box .wrap,
.markdown-box .wrap * ,
.markdown-box .label-wrap,
.markdown-box .block,
.markdown-box .form,
.markdown-box .container,
.markdown-box .box {
    border: none !important;
    box-shadow: none !important;
    background: transparent !important;
    padding: 0 !important;
}

/* Target the specific container that usually holds the markdown output */
.markdown-box > .wrap > .block,
.markdown-box > .block {
    border: none !important;
    background: transparent !important;
}

.footer-text {
    text-align: center !important;
    margin-top: 2rem !important;
    padding-top: 1.5rem !important;
    color: var(--muted) !important;
    border-top: 1px solid var(--border) !important;
    font-size: 0.9rem !important;
}
"""
| |
|
| | |
# Build the Gradio UI: a two-column layout (inputs on the left, parsed
# results on the right) followed by the button wiring and a footer.
# NOTE(review): the nesting below was reconstructed from the statement order;
# in particular, confirm that html_input / base_url_input sit below the tabs
# rather than inside the (empty) HTML tab.
# NOTE(review): several labels start with mis-decoded characters (likely
# emoji); they are kept as found — confirm against the original file.
with gr.Blocks(title="UltraData-Math-L0-Parser", css=custom_css, theme=gr.themes.Soft()) as demo:
    gr.HTML('<h1 class="main-title">UltraData-Math-L0-Parser</h1>')
    gr.HTML('<p class="subtitle">Unified HTML Parser for Mathematical Content Extraction</p>')

    with gr.Row():
        # Left column: URL / HTML input and parser options.
        with gr.Column(scale=1, elem_classes=["glass-panel"]):
            gr.HTML('<div class="section-header">Input</div>')

            with gr.Tabs():
                with gr.TabItem("π URL"):
                    url_input = gr.Textbox(
                        label="URL",
                        placeholder="Enter URL to fetch (e.g., https://example.com/math-article)",
                        lines=3,
                        max_lines=5,
                        value="https://math.stackexchange.com/questions/5120625/ode-problem-of-yt-sqrtyt-with-the-inital-value-y0-1-t-geq-0",
                    )
                    fetch_btn = gr.Button("Fetch & Parse", variant="primary", size="lg")

                # This tab declares no components of its own.
                with gr.TabItem("π HTML"):
                    pass

            # Pre-filled with the sample page; also an output of the fetch
            # button, which writes the downloaded HTML here.
            html_input = gr.Textbox(
                label="HTML Content",
                placeholder="Paste your HTML content here or fetch from URL above...",
                lines=10,
                max_lines=20,
                value=EXAMPLE_HTML,
            )

            base_url_input = gr.Textbox(
                label="Base URL (Auto-filled from URL fetch)",
                placeholder="https://example.com/page",
                lines=1,
            )

            # Created with visible=False; the option components inside still
            # supply their default values as inputs to both button handlers.
            with gr.Accordion("βοΈ Advanced Options", open=False, visible=False):
                html_type = gr.Radio(
                    choices=["unified", "article", "forum"],
                    value="unified",
                    label="Parser Type",
                    info="Select the parsing strategy",
                )
                process_math = gr.Checkbox(
                    label="Process Math Expressions",
                    value=True,
                    info="Convert MathML and LaTeX to unified format",
                )
                include_tables = gr.Checkbox(
                    label="Include Tables",
                    value=True,
                    info="Preserve table elements in output",
                )
                enable_forum = gr.Checkbox(
                    label="Enable Forum Assembly",
                    value=True,
                    info="Assemble forum posts and comments",
                )

            with gr.Row():
                parse_btn = gr.Button("π Parse HTML", variant="primary", size="lg")
                clear_btn = gr.Button("ποΈ Clear", variant="secondary", size="lg")

        # Right column: extracted title plus three views of the result.
        with gr.Column(scale=1, elem_classes=["glass-panel"]):
            gr.HTML('<div class="section-header">Output</div>')

            title_output = gr.Textbox(
                label="Extracted Title",
                lines=1,
                interactive=False,
            )

            with gr.Tabs():
                with gr.TabItem("β¨ Markdown"):
                    markdown_output = gr.Markdown(
                        value="### Output will appear here...",
                        label="Markdown Preview",
                        elem_classes=["markdown-box"],
                        # Delimiter pairs recognized as math, with the
                        # display (block) vs. inline flag for each.
                        latex_delimiters=[
                            {"left": "$$", "right": "$$", "display": True},
                            {"left": "$", "right": "$", "display": False},
                            {"left": "\\[", "right": "\\]", "display": True},
                            {"left": "\\(", "right": "\\)", "display": False},
                        ],
                    )
                with gr.TabItem("π Plain Text"):
                    text_output = gr.Textbox(
                        label="Plain Text (w3m rendered)",
                        lines=25,
                        max_lines=30,
                        interactive=False,
                        autoscroll=False,
                        elem_classes=["output-textbox"],
                    )
                with gr.TabItem("π Raw HTML"):
                    html_output = gr.Textbox(
                        label="Extracted HTML",
                        lines=25,
                        max_lines=30,
                        interactive=False,
                        autoscroll=False,
                        elem_classes=["output-textbox"],
                    )

    # fetch_and_parse returns six values; the first two refresh the HTML and
    # base-URL inputs, the remaining four fill the output widgets.
    fetch_btn.click(
        fn=fetch_and_parse,
        inputs=[url_input, process_math, include_tables, enable_forum, html_type],
        outputs=[html_input, base_url_input, title_output, html_output, text_output, markdown_output],
    )

    # process_input returns the 4-tuple from format_output
    # (title, html, text, markdown), matching the output order below.
    parse_btn.click(
        fn=process_input,
        inputs=[html_input, base_url_input, process_math, include_tables, enable_forum, html_type],
        outputs=[title_output, html_output, text_output, markdown_output],
    )

    def clear_all():
        # One empty string per component listed in clear_btn's outputs (7).
        return "", "", "", "", "", "", ""

    clear_btn.click(
        fn=clear_all,
        outputs=[url_input, html_input, base_url_input, title_output, html_output, text_output, markdown_output],
    )

    # Footer, styled by the .footer-text rule in custom_css.
    gr.HTML("""
    <div class="footer-text">
    <p>π¬ <strong>UltraData Math Parser</strong> - Part of the UltraData-Math Project</p>
    <p>Specialized in extracting mathematical content from web pages with MathML, LaTeX, and formula support.</p>
    </div>
    """)
| |
|
| |
|
# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    # NOTE(review): ssr_mode=False presumably turns off Gradio's server-side
    # rendering — confirm against the launch() documentation.
    demo.launch(ssr_mode=False)
| |
|