function process_markdown_content_v1
Parses markdown-formatted text content and converts it into a structured list of document elements (headers, paragraphs, lists, tables, code blocks) with their types and formatting preserved in original order.
/tf/active/vicechatdev/vice_ai/new_app.py
3682 - 3907
complex
Purpose
This function is designed for document export workflows where markdown content needs to be converted into a structured format that can be easily rendered in various output formats (DOCX, PDF, etc.). It handles complex markdown elements including multi-level headers, bullet/numbered lists, tables with headers, code blocks, and inline formatting. The function maintains the original order of elements and processes inline markdown syntax (bold, italic, links) within text content.
Source Code
def process_markdown_content(content):
    """
    Convert markdown-formatted content to a structured format for document export.

    The text is scanned line by line (each line is stripped first) and turned
    into a flat list of element dicts, kept in their original order.

    Args:
        content: Markdown text using '\\n' as the line separator. Any falsy
            value (None, empty string) yields an empty list.

    Returns:
        A list of dicts. Every dict has 'type', 'content' and 'formatting'
        (always an empty list here). The element types produced are:

        - 'header': 'content' is the heading text, plus 'level' (1-6).
        - 'paragraph': consecutive plain lines joined with single spaces.
        - 'list_item': one per line starting with '-', '*' or '+'.
        - 'numbered_list_item': one per line starting with '<digits>.'.
        - 'table': 'content' is a list of row dicts, each with 'type'
          ('header' or 'data') and 'cells' (list of strings).
        - 'code_block_start': one per line starting with three backticks;
          'content' is whatever follows the backticks on that line.

        Paragraph, list item and table cell text is passed through
        process_inline_markdown(); header text is left untouched.

    Notes:
        - Relies on a module-level ``logger`` and on a module-level
          ``process_inline_markdown`` function.
        - A table starts only when a line containing '|' is immediately
          followed by a separator line such as ``|---|---|``.
        - Empty interior table cells are preserved as '' so that columns stay
          aligned between rows; rows with no cell content at all are skipped.
        - Code fences are not tracked as a region: the closing fence also
          yields a 'code_block_start' element, and the lines between two
          fences are parsed like any other markdown line.
    """
    if not content:
        return []

    # NOTE(review): html is not referenced anywhere in this function; the
    # import is kept only so the function's import footprint is unchanged.
    import html
    import re

    # Compiled once per call instead of being re-looked-up for every line.
    separator_pattern = re.compile(r'^\s*\|?\s*[-:]+\s*\|')
    header_pattern = re.compile(r'^(#{1,6})\s+(.+)$')
    bullet_pattern = re.compile(r'^[-*+]\s+(.+)$')
    numbered_pattern = re.compile(r'^\d+\.\s+(.+)$')

    elements = []
    lines = content.split('\n')
    current_paragraph = []
    table_rows = []
    in_table = False

    def flush_paragraph(before=None):
        """Emit the buffered lines as one paragraph element and reset the buffer."""
        text = ' '.join(current_paragraph).strip()
        current_paragraph.clear()
        if not text:
            return
        elements.append({
            'type': 'paragraph',
            'content': text,
            'formatting': []
        })
        if before:
            logger.info("DEBUG: Added paragraph before %s: '%s...'", before, text[:50])

    def close_table(log_rows=False):
        """Emit the collected rows as one table element and leave table mode."""
        nonlocal table_rows, in_table
        if table_rows:
            elements.append({
                'type': 'table',
                'content': table_rows,
                'formatting': []
            })
            if log_rows:
                logger.info("DEBUG: Added table element with %d rows", len(table_rows))
        # Rebind rather than clear(): the emitted element keeps the old list.
        table_rows = []
        in_table = False

    def split_cells(row):
        """Split a stripped table row into cell strings, keeping empty interior cells."""
        # Remove only the outer border pipes; a blanket "drop empty strings"
        # filter would also drop genuinely empty cells and shift columns.
        if row.startswith('|'):
            row = row[1:]
        if row.endswith('|'):
            row = row[:-1]
        cells = [cell.strip() for cell in row.split('|')]
        # A row without any content contributes nothing to the table.
        return cells if any(cells) else []

    logger.info("DEBUG: Processing %s lines of markdown content", len(lines))

    for i, raw_line in enumerate(lines):
        line = raw_line.strip()

        if not line:
            # Blank line: terminates whichever block is currently open.
            if in_table:
                close_table()
            elif current_paragraph:
                flush_paragraph()
            continue

        # A line with '|' opens a table only if the next line is a separator row.
        if '|' in line and not in_table:
            if i + 1 < len(lines) and separator_pattern.match(lines[i + 1].strip()):
                logger.info("Starting table at line %s: '%s'", i, line)
                flush_paragraph('table')
                in_table = True
                cells = split_cells(line)
                logger.info("Header cells: %s", cells)
                if cells:
                    table_rows.append({
                        'type': 'header',
                        'cells': cells
                    })
                continue

        # The separator row carries no data.
        if in_table and separator_pattern.match(line):
            continue

        if in_table and '|' in line:
            cells = split_cells(line)
            logger.info("Data row cells: %s", cells)
            if cells:
                table_rows.append({
                    'type': 'data',
                    'cells': cells
                })
            continue

        # A non-pipe line ends the table; the line itself is still classified below.
        if in_table:
            close_table(log_rows=True)

        header_match = header_pattern.match(line)
        if header_match:
            flush_paragraph('header')
            elements.append({
                'type': 'header',
                'level': len(header_match.group(1)),
                'content': header_match.group(2),
                'formatting': []
            })
            continue

        list_match = bullet_pattern.match(line)
        if list_match:
            flush_paragraph('list item')
            elements.append({
                'type': 'list_item',
                'content': list_match.group(1),
                'formatting': []
            })
            continue

        num_list_match = numbered_pattern.match(line)
        if num_list_match:
            flush_paragraph('numbered list')
            elements.append({
                'type': 'numbered_list_item',
                'content': num_list_match.group(1),
                'formatting': []
            })
            continue

        if line.startswith('```'):
            flush_paragraph('code block')
            elements.append({
                'type': 'code_block_start',
                'content': line[3:],  # Language if specified
                'formatting': []
            })
            continue

        # Plain text: accumulate until something terminates the paragraph.
        current_paragraph.append(line)

    # End of input: emit whatever block is still open.
    if in_table and table_rows:
        logger.info("End of content - adding table with %s rows", len(table_rows))
        elements.append({
            'type': 'table',
            'content': table_rows,
            'formatting': []
        })
    elif current_paragraph:
        flush_paragraph()

    logger.info("Final element count: %s", len(elements))

    # Inline formatting is applied after structuring; headers and code fence
    # markers are deliberately left as raw text.
    for element in elements:
        if element['type'] in ['paragraph', 'list_item', 'numbered_list_item']:
            element['content'] = process_inline_markdown(element['content'])
        elif element['type'] == 'table':
            for row in element['content']:
                row['cells'] = [process_inline_markdown(cell) for cell in row['cells']]

    return elements
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| content | - | - | positional_or_keyword |
Parameter Details
content: A string containing markdown-formatted text. Can include headers (# syntax), lists (-, *, + for bullets, 1. for numbered), tables (| delimited), code blocks (``` delimited), and inline formatting. Can be None or empty string, which returns an empty list. Multi-line content should use \n as line separator.
Return Value
Returns a list of dictionaries, where each dictionary represents a structured content element. Each element has 'type' (string: 'header', 'paragraph', 'list_item', 'numbered_list_item', 'table', 'code_block_start'), 'content' (string or list depending on type), and 'formatting' (list, currently empty but reserved for future use). Headers include 'level' (1-6). Tables have 'content' as a list of row dictionaries with 'type' ('header' or 'data') and 'cells' (list of strings). Returns empty list if input is None/empty.
Dependencies
html, re, logging
Required Imports
import html
import re
import logging
Conditional/Optional Imports
These imports are only needed under specific conditions:
import html
Condition: imported lazily inside the function for markdown conversion
Required (conditional)
import re
Condition: imported lazily inside the function for regex pattern matching
Required (conditional)
Usage Example
import html
import re
import logging
# Module-level logger: process_markdown_content() calls logger.info(), so a
# global named `logger` must exist before the function is invoked.
logger = logging.getLogger(__name__)
# Let INFO-level records (the function's progress messages) through this logger.
logger.setLevel(logging.INFO)
# Define required helper function
def process_inline_markdown(text):
    """Convert inline markdown (**bold**, *italic*, [links](url)) to simple HTML tags.

    Args:
        text: A single piece of text (paragraph, list item or table cell).

    Returns:
        The text with ``**x**`` replaced by ``<b>x</b>``, ``*x*`` by
        ``<i>x</i>`` and ``[label](url)`` by ``<a href="url">label</a>``.
        No HTML escaping is performed on the input.
    """
    # Bold is handled first so that '**' pairs are consumed before the
    # single-asterisk rule runs. The lookarounds require non-whitespace right
    # inside the delimiters, so plain asterisks such as "2 * 3 * 4" are not
    # mistaken for emphasis.
    text = re.sub(r'\*\*(?=\S)(.+?)(?<=\S)\*\*', r'<b>\1</b>', text)
    text = re.sub(r'\*(?=\S)(.+?)(?<=\S)\*', r'<i>\1</i>', text)
    text = re.sub(r'\[(.+?)\]\((.+?)\)', r'<a href="\2">\1</a>', text)
    return text
# Sample input: a header, a paragraph with inline formatting, bullets,
# a two-column table and a numbered list.
markdown_text = '''# Main Title
This is a paragraph with **bold** and *italic* text.
## Subsection
- First bullet point
- Second bullet point
| Header 1 | Header 2 |
|----------|----------|
| Cell 1 | Cell 2 |
| Cell 3 | Cell 4 |
1. First numbered item
2. Second numbered item
'''
# Parse the sample into an ordered list of element dicts.
elements = process_markdown_content(markdown_text)
# Walk the result and print the element kinds this example cares about;
# other kinds (list items, code fences) are ignored here.
for element in elements:
    kind = element['type']
    if kind == 'header':
        print(f"Header Level {element['level']}: {element['content']}")
        continue
    if kind == 'paragraph':
        print(f"Paragraph: {element['content']}")
        continue
    if kind == 'table':
        rows = element['content']
        print(f"Table with {len(rows)} rows")
        for row in rows:
            print(f" {row['type']}: {row['cells']}")
Best Practices
- Ensure a logger object is configured in the module scope before calling this function, as it uses logger.info() for debugging
- The function requires a helper function 'process_inline_markdown()' to be defined in the same module for processing inline markdown syntax
- Input content should use \n as line separators for proper parsing of multi-line markdown
- Tables must follow standard markdown table syntax with pipe delimiters and a separator row (|---|---|) after headers
- The function preserves element order, making it suitable for sequential document rendering
- Empty lines are used as delimiters between different content blocks (paragraphs, tables, etc.)
- The 'formatting' field in returned elements is currently empty but reserved for future enhancements
- For large documents, consider the memory implications as the entire content is processed in memory
- The function handles edge cases like tables at the end of content and paragraphs before special elements
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
-
function process_markdown_content 95.0% similar
-
function simple_markdown_to_html 74.9% similar
-
function html_to_markdown_v1 74.5% similar
-
function add_formatted_content_to_word_v1 72.7% similar
-
function test_markdown_processing 71.8% similar