function create_enhanced_word_document_v1
Converts markdown content into a formatted Microsoft Word document with proper styling, table of contents, warranty sections, and reference handling for Project Victoria warranty disclosures.
/tf/active/vicechatdev/enhanced_word_converter_fixed.py
90 - 478
complex
Purpose
This function generates a professionally formatted Word document from markdown content containing warranty information. It creates a structured document with a title page, metadata, table of contents, warranty sections with proper hierarchical headings, and a comprehensive references section. The function handles complex markdown formatting including bold text, lists, inline references, and multi-level headings while preserving document structure and applying appropriate Word styles.
Source Code
def create_enhanced_word_document(markdown_content, output_path):
    """Render warranty-disclosure markdown as a formatted Word document.

    The document is built in this order: a centered title, four metadata
    paragraphs (generation time and counts), a page break, a static table of
    contents listing every warranty section, another page break, and then the
    markdown body converted line by line.

    Body conversion is a small state machine over the stripped lines:

    * the markdown's own title, metadata and ``## Table of Contents`` block
      are skipped (they are regenerated above);
    * ``## <id> - <title>`` opens a warranty section (Heading 1);
    * ``### <name>`` opens a subsection (Heading 2); only the subsection
      named ``Disclosure`` switches on disclosure formatting;
    * ``## References`` switches to reference formatting for the remainder
      of the input.

    Args:
        markdown_content: Markdown text to convert.
        output_path: Destination passed unchanged to ``Document.save``.

    Returns:
        None. The document is written to ``output_path`` and an INFO message
        is logged through the module-level ``logger``.
    """
    doc = Document()

    title = doc.add_heading('Project Victoria - Warranty Disclosures', 0)
    title.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

    # Front-matter counts are derived from the markdown itself.
    total_refs = extract_total_references(markdown_content)
    doc.add_paragraph(f"Generated on: {datetime.now().strftime('%B %d, %Y at %H:%M:%S')}")
    warranty_sections = extract_warranty_sections(markdown_content)
    doc.add_paragraph(f"Total Warranties Processed: {len(warranty_sections)}")
    doc.add_paragraph(f"Total Disclosures Generated: {len(warranty_sections)}")
    doc.add_paragraph(f"Total References: {total_refs}")
    doc.add_page_break()

    # Table of contents written as plain paragraphs (no Word TOC field).
    doc.add_heading('Table of Contents', level=1)
    doc.add_paragraph("Warranty Sections:", style='Heading 2')
    for warranty in sorted(warranty_sections, key=lambda x: x['id']):
        toc_entry = doc.add_paragraph(f"{warranty['id']} - {warranty['title']}")
        toc_entry.paragraph_format.left_indent = Inches(0.3)
    doc.add_paragraph("References", style='Heading 2')
    ref_entry = doc.add_paragraph("Complete Bibliography and Source Documents")
    ref_entry.paragraph_format.left_indent = Inches(0.3)
    doc.add_page_break()

    lines = markdown_content.split('\n')
    current_section = None
    in_disclosure = False
    in_references = False
    skip_toc = False

    i = 0
    while i < len(lines):
        raw_line = lines[i]
        line = raw_line.strip()

        # Blank lines only produce output where spacing is meaningful.
        if not line:
            if in_disclosure or in_references:
                doc.add_paragraph("")
            i += 1
            continue

        # Drop literal backslash-n sequences left in the markdown.
        line = line.replace('\\n', '')

        # The title and metadata were already regenerated above.
        if line.startswith(('# Project Victoria', '**Generated on**', '**Total Warranties')):
            i += 1
            continue

        # Skip the markdown's own table of contents. Skipping ends at the
        # next '## ' header whose text does not contain 'References'.
        if line == '## Table of Contents':
            skip_toc = True
            i += 1
            continue
        if skip_toc:
            if line.startswith('## ') and 'References' not in line:
                skip_toc = False
            else:
                i += 1
                continue

        if line == '## References':
            in_references = True
            in_disclosure = False
            doc.add_heading('References', 1)
            i += 1
            continue

        # Once the References header has been seen, every remaining line is
        # treated as reference content.
        if in_references:
            # Indentation must be read from the raw line: `line` is stripped.
            _add_reference_line(doc, line, raw_line[:1].isspace())
            i += 1
            continue

        if line.startswith('## ') and ' - ' in line:
            # Warranty section header, e.g. "## 1.1(a) - Title".
            current_section = line[3:]
            doc.add_heading(clean_text_for_xml(current_section), 1)
            in_disclosure = False

            # Consume the lines that directly follow the header, up to the
            # first blank line or '###' subsection. Only the two known
            # metadata labels are rendered; any other line here is skipped.
            j = i + 1
            while j < len(lines) and lines[j].strip():
                next_line = lines[j].strip()
                if next_line.startswith(('**Section**:', '**Source Documents Found**:')):
                    label, _, value = next_line.partition(':')
                    _add_bold_label(doc.add_paragraph(), label[2:-2], value)
                elif next_line.startswith('###'):
                    break
                j += 1
            # The shared `i += 1` below resumes at line j.
            i = j - 1
        elif line.startswith('### '):
            subsection = line[4:]
            doc.add_heading(clean_text_for_xml(subsection), 2)
            in_disclosure = subsection == 'Disclosure'

            # A horizontal rule is placed after each 'Warranty Text' heading
            # once a warranty section is open.
            if subsection == 'Warranty Text' and current_section:
                doc.add_paragraph()
                separator = doc.add_paragraph("─" * 80)
                separator.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
                separator.paragraph_format.space_before = Inches(0.1)
                separator.paragraph_format.space_after = Inches(0.1)
                doc.add_paragraph()
        elif in_disclosure:
            _add_disclosure_line(doc, line)
        else:
            _add_body_line(doc, line, current_section)

        i += 1

    doc.save(output_path)
    logger.info(f"Enhanced Word document saved: {output_path}")


# Detail lines that may follow a reference entry, as
# (label, render value in the smaller font, space after in inches).
_REFERENCE_DETAIL_STYLES = (
    ('Content preview', True, 0.02),
    ('Document type', False, 0.02),
    ('Date', False, 0.02),
    ('Source location', False, 0.02),
    ('Author', False, 0.02),
    ('Category', False, 0.02),
    ('Relevance', False, 0.02),
    ('Summary', True, 0.05),
)


def _add_bold_label(paragraph, label, value):
    """Append ``label:`` in bold, then the cleaned value when it is non-blank."""
    paragraph.add_run(label + ':').bold = True
    if value.strip():
        paragraph.add_run(' ' + clean_text_for_xml(value.strip()))


def _add_reference_line(doc, line, indented):
    """Render one non-blank line of the References section.

    ``line`` is the stripped text; ``indented`` tells whether the raw
    markdown line started with whitespace.
    """
    if line.startswith(('*The following documents', '*Total references:')) or line == '---':
        # Section intro / footer / rule.
        p = doc.add_paragraph(clean_text_for_xml(line))
        p.paragraph_format.space_after = Inches(0.1)
        return

    if line.startswith('**[') and ']**' in line:
        # Reference entry such as "**[1]** Source Name".
        ref_match = re.match(r'\*\*\[(\d+)\]\*\*\s*(.*)', line)
        if ref_match:
            p = doc.add_paragraph()
            p.paragraph_format.space_before = Inches(0.1)
            p.paragraph_format.space_after = Inches(0.05)
            number_run = p.add_run(f"[{ref_match.group(1)}] ")
            number_run.bold = True
            # Font sizes in this section are given in inches (0.12 in = 8.64 pt).
            number_run.font.size = Inches(0.12)
            p.add_run(clean_text_for_xml(ref_match.group(2)))
        return

    # Detail lines are matched on the stripped text. Testing for a leading
    # space here could never succeed because the caller strips every line.
    for label, small_value, space_after in _REFERENCE_DETAIL_STYLES:
        prefix = f'*{label}:'
        if line.startswith(prefix):
            p = doc.add_paragraph()
            label_run = p.add_run(f"{label}: ")
            label_run.italic = True
            label_run.font.size = Inches(0.1)
            value_run = p.add_run(clean_text_for_xml(line[len(prefix):].strip()))
            if small_value:
                value_run.font.size = Inches(0.09)
            p.paragraph_format.left_indent = Inches(0.5)
            p.paragraph_format.space_after = Inches(space_after)
            return

    content = line.strip()
    if indented:
        # Any other indented line is kept, indented under its entry.
        if content:
            p = doc.add_paragraph(clean_text_for_xml(content))
            p.paragraph_format.left_indent = Inches(0.5)
    elif content and not line.startswith('**['):
        doc.add_paragraph(clean_text_for_xml(line))


def _add_disclosure_line(doc, line):
    """Render one non-blank line that belongs to a Disclosure subsection."""
    # One to five '#' followed by a space map to heading levels 3 to 7.
    header = re.match(r'(#{1,5}) ', line)
    if header:
        depth = len(header.group(1))
        doc.add_heading(clean_text_for_xml(line[depth + 1:].strip()), depth + 2)
    elif line.startswith('**') and line.endswith('**') and len(line) > 4 and ':' not in line:
        # Fully bold line: short, capitalised text is promoted to a heading.
        content = line[2:-2]
        if len(content.split()) <= 6 and (content[0].isupper() or content.isupper()):
            doc.add_heading(clean_text_for_xml(content), 6)
        else:
            doc.add_paragraph().add_run(clean_text_for_xml(content)).bold = True
    elif line.startswith('**') and '**:' in line:
        # "**label**: content"
        p = doc.add_paragraph()
        marker_pos = line.find('**:')
        if marker_pos > 0:
            _add_bold_label(p, line[2:marker_pos], line[marker_pos + 3:])
        else:
            # Malformed bold text: keep it as plain text.
            p.add_run(clean_text_for_xml(line))
    elif line.startswith('- '):
        p = doc.add_paragraph(clean_text_for_xml(line[2:]), style='List Bullet')
        format_inline_references(p)
    elif re.match(r'^\d+\.', line):
        # The written number is kept in the text of the numbered item.
        p = doc.add_paragraph(clean_text_for_xml(line), style='List Number')
        format_inline_references(p)
    elif line.endswith(':') and not line.startswith('-') and len(line.split()) <= 6:
        # Short line ending in a colon: treated as a heading when it looks like one.
        content = line[:-1].strip()
        if content and (content[0].isupper() or content.count(' ') <= 3):
            doc.add_heading(clean_text_for_xml(content), 7)
        else:
            p = doc.add_paragraph(clean_text_for_xml(line))
            format_inline_references(p)
    elif line and not line.startswith('---') and line != '```':
        # Ordinary paragraph; rules and code-fence markers are dropped.
        p = doc.add_paragraph(clean_text_for_xml(line))
        format_inline_references(p)


def _add_body_line(doc, line, current_section):
    """Render one line outside Disclosure and References (e.g. warranty text)."""
    if line.startswith('**') and ':' in line and not line.endswith('**'):
        p = doc.add_paragraph()
        colon_pos = line.find(':')
        if line[:colon_pos].endswith('**'):
            # "**label**: content"
            _add_bold_label(p, line[2:colon_pos - 2], line[colon_pos + 1:])
        else:
            # Starts bold but is not a label: keep as plain text.
            p.add_run(clean_text_for_xml(line))
    elif line and not line.startswith('---'):
        p = doc.add_paragraph(clean_text_for_xml(line))
        if current_section:
            # Text inside a warranty section is shown as an indented quote.
            try:
                p.style = 'Quote'
            except (KeyError, ValueError):
                # No usable 'Quote' paragraph style: fall back to indent only.
                p.paragraph_format.left_indent = Inches(0.5)
            else:
                p.paragraph_format.left_indent = Inches(0.5)
                p.paragraph_format.space_before = Inches(0.05)
                p.paragraph_format.space_after = Inches(0.05)
        else:
            format_inline_references(p)
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| markdown_content | - | - | positional_or_keyword |
| output_path | - | - | positional_or_keyword |
Parameter Details
markdown_content: String containing the markdown-formatted content to convert. Expected to include Project Victoria warranty sections with specific markdown patterns including warranty IDs, titles, warranty text sections, disclosure sections, and references. Should contain headers marked with #, ##, ###, bold text with **, list items with -, and reference citations in [n] format.
output_path: String or Path object specifying the file path where the generated Word document should be saved. Should include the .docx extension. The target directory must already exist and be writable; the function does not create it.
Return Value
This function returns None. It produces a side effect by creating and saving a Word document to the specified output_path. The document is saved using the python-docx library's save() method.
Dependencies
python-docx, re, logging, pathlib, datetime
Required Imports
import re
import logging
from pathlib import Path
from datetime import datetime
from docx import Document
from docx.shared import Inches
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.enum.style import WD_STYLE_TYPE
Usage Example
import logging
from docx import Document
from docx.shared import Inches
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from datetime import datetime
import re
# create_enhanced_word_document reports through a module-level name `logger`,
# so it must exist before the function is called.
logger = logging.getLogger(__name__)
# Show INFO and above so the "document saved" message is visible.
logging.basicConfig(level=logging.INFO)
# Define required helper functions
def extract_total_references(markdown_content):
    """Return how many distinct reference numbers appear as ``**[n]**``."""
    marker = re.compile(r'\*\*\[(\d+)\]\*\*')
    # Numbers are compared as the digit strings found in the text.
    distinct_numbers = {hit.group(1) for hit in marker.finditer(markdown_content)}
    return len(distinct_numbers)
def extract_warranty_sections(markdown_content):
    """List warranty headers found in the markdown, in order of appearance.

    A header has the form ``## <n>.<n>(<letter>) - <title>``. Each result is a
    dict with the keys ``'id'`` and ``'title'``.
    """
    header = re.compile(r'## (\d+\.\d+\([a-z]\)) - (.+)')
    return [
        {'id': warranty_id, 'title': warranty_title}
        for warranty_id, warranty_title in header.findall(markdown_content)
    ]
def clean_text_for_xml(text):
    """Return ``text`` without markdown asterisks or XML-illegal control characters.

    All ``*`` characters are removed (which also covers ``**``), then the
    control characters that XML 1.0 does not allow are dropped, and finally
    surrounding whitespace is stripped. Tab, line feed and carriage return
    are legal in XML and are kept when they occur inside the text.
    """
    without_markup = text.replace('*', '')
    # U+0000-U+0008, U+000B, U+000C and U+000E-U+001F are outside the XML 1.0
    # Char range.
    return re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', without_markup).strip()
def format_inline_references(paragraph):
    """Italicise every run of ``paragraph`` that contains a ``[n]`` citation.

    Citations are collected from the paragraph's full text; a run is changed
    only when one complete ``[n]`` marker occurs inside that run's own text.
    """
    markers = {f'[{number}]' for number in re.findall(r'\[(\d+)\]', paragraph.text)}
    for run in paragraph.runs:
        if any(marker in run.text for marker in markers):
            run.italic = True
# Minimal example input: the document title, one warranty section with a
# "Warranty Text" and a "Disclosure" subsection, and a References section
# holding a single **[1]** entry that the disclosure cites as [1].
markdown_content = '''# Project Victoria - Warranty Disclosures
## 1.1(a) - Product Quality Warranty
### Warranty Text
The product shall be free from defects.
### Disclosure
This warranty covers manufacturing defects [1].
## References
**[1]** Manufacturing Standards Document
'''
# Convert the markdown; a relative path is resolved against the current
# working directory.
output_path = 'warranty_disclosures.docx'
create_enhanced_word_document(markdown_content, output_path)
print(f'Document created: {output_path}')
Best Practices
- Ensure all helper functions (extract_total_references, extract_warranty_sections, clean_text_for_xml, format_inline_references) are defined before calling this function
- Configure a logger instance before use to capture document generation status messages
- Validate that the markdown_content follows the expected format with proper section markers (##, ###) and reference formatting
- Ensure the output directory exists and has write permissions before calling the function
- The markdown content should follow Project Victoria conventions with warranty IDs in format X.X(x) and proper section hierarchy
- Large documents may take significant time to process due to complex formatting operations
- Test with sample markdown content first to ensure helper functions are correctly implemented
- The function modifies document styles and formatting extensively - ensure python-docx version compatibility
- Consider memory usage for very large markdown documents with many references and sections
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- function create_enhanced_word_document 94.4% similar
- function main_v1 84.4% similar
- function create_word_report_improved 82.6% similar
- function create_word_report 81.6% similar
- function main_v15 74.3% similar