Information Technology Grimoire

Version .0.0.1

IT Notes from various projects because I forget, and hopefully they help you too.

Python3 Markdown/HTML to Docx

Use Case

The GPT I wrote gave me answers in markdown format. I wanted docx format, and I wanted it in a specific way. The answer was to create a script that would convert these markdown outputs for me.

I Keep markdown notes in joplin, and wanted it all converted to docx so I could edit it and share it on google docs. It would be useful for creating a book, which is what I used it for because it creates a properly formatted document structure.

The Scripts

Install the prereqs

This is the requirements.txt made with pip freeze from a working version. I recommend a requirements.txt file that you install a venv and load these modules into that venv. Do not load these into your core python install in case they clobber some existing modules. Virtual environments are the only way I recommned.

beautifulsoup4==4.12.3
bs4==0.0.2
lxml==5.2.2
markdown2==2.4.13
mistune==3.0.2
pillow==10.3.0
python-docx==1.1.2
soupsieve==2.5
ttkbootstrap==1.10.1
typing_extensions==4.12.1

Then python -m pip install -r requirements.txt

GUI Markdown to Docx

md2docx screenshot

If you have a block of markdown and want it converted to docx, this gui app will do that for you.

import tkinter as tk
from tkinter import ttk
import ttkbootstrap as ttkb
from docx import Document
from docx.shared import Pt, RGBColor
from bs4 import BeautifulSoup
import markdown2
import mistune
import re
from config import FONT_NAME, FONT_SIZE, H2_FONT_SIZE, H3_FONT_SIZE, H2_COLOR, H3_COLOR, LIST_INDENT

def sanitize_filename(filename):
    # Replace invalid characters with space and consolidate multiple spaces
    sanitized = re.sub(r'[^A-Za-z0-9 -]', ' ', filename)
    sanitized = re.sub(r'\s+', ' ', sanitized).strip()
    return sanitized

def validate_markdown(markdown_text):
    try:
        mistune.create_markdown()(markdown_text)
        return True, "Markdown is valid."
    except Exception as e:
        return False, f"Markdown is invalid: {e}"

def add_paragraph_with_style(document, text, style=None, indent=0, color=None, font_name=FONT_NAME, font_size=FONT_SIZE, bold=False):
    p = document.add_paragraph()
    if style:
        p.style = document.styles[style]
    if indent:
        p.paragraph_format.left_indent = Pt(indent)
    run = p.add_run(text)
    font = run.font
    font.name = font_name
    font.size = Pt(font_size)
    font.bold = bold
    if color:
        font.color.rgb = color
    return p

def add_bold_run(paragraph, text, font_name=FONT_NAME, font_size=FONT_SIZE, color=None):
    run = paragraph.add_run(text)
    run.bold = True
    font = run.font
    font.name = font_name
    font.size = Pt(font_size)
    if color:
        font.color.rgb = color
    return run

def add_run(paragraph, text, font_name=FONT_NAME, font_size=FONT_SIZE, color=None):
    run = paragraph.add_run(text)
    font = run.font
    font.name = font_name
    font.size = Pt(font_size)
    if color:
        font.color.rgb = color
    return run

def process_html_element(element, paragraph):
    if element.name is None:
        add_run(paragraph, element)
    elif element.name == 'strong' or element.name == 'b':
        add_bold_run(paragraph, element.text)
    else:
        for child in element.children:
            process_html_element(child, paragraph)

def convert_markdown_to_docx(markdown_text, output_file):
    try:
        document = Document()
        html = markdown2.markdown(markdown_text, extras=["tables"])

        soup = BeautifulSoup(html, 'html.parser')
        
        for element in soup.children:
            if element.name == 'h2':
                p = add_paragraph_with_style(document, '', style='Heading 2', font_size=H2_FONT_SIZE, bold=True)
                add_run(p, element.text, font_size=H2_FONT_SIZE, color=RGBColor(*H2_COLOR))  # Dark Blue
            elif element.name == 'h3':
                p = add_paragraph_with_style(document, '', style='Heading 3', indent=LIST_INDENT, font_size=H3_FONT_SIZE, bold=True)
                add_run(p, element.text, font_size=H3_FONT_SIZE, color=RGBColor(*H3_COLOR))  # Dark Red
            elif element.name == 'ul':
                for li in element.find_all('li'):
                    p = add_paragraph_with_style(document, '', indent=LIST_INDENT)
                    add_run(p, '• ')
                    for child in li.children:
                        process_html_element(child, p)
            elif element.name == 'p':
                p = add_paragraph_with_style(document, '', indent=LIST_INDENT)
                for child in element.children:
                    process_html_element(child, p)
            elif element.name == 'table':
                print("Processing a table")  # Debug print
                rows = element.find_all('tr')
                max_cols = max(len(row.find_all(['td', 'th'])) for row in rows)
                table = document.add_table(rows=len(rows), cols=max_cols)
                for i, row in enumerate(rows):
                    cells = row.find_all(['td', 'th'])
                    for j, cell in enumerate(cells):
                        p = table.cell(i, j).paragraphs[0]
                        for child in cell.children:
                            process_html_element(child, p)
                        print(f"Added cell [{i},{j}] with text: {cell.text}")  # Debug print
                    for j in range(len(cells), max_cols):
                        table.cell(i, j).text = ""
                        print(f"Added empty cell [{i},{j}]")  # Debug print

        document.save(output_file)
        print("Document saved successfully!")  # Debug print
    except Exception as e:
        print(f"Error in convert_markdown_to_docx: {e}")  # Debug print
        raise

def create_docx():
    markdown_text = text_area.get("1.0", tk.END).strip()
    output_file = file_name_entry.get().strip()
    if markdown_text and output_file:
        try:
            convert_markdown_to_docx(markdown_text, output_file)
            status_message = f"DOCX file '{output_file}' created successfully!"
            status_label.config(text=status_message, foreground="green")
            print(status_message)
        except Exception as e:
            status_message = f"Error: {e}"
            status_label.config(text=status_message, foreground="red")
            print(status_message)
    else:
        status_message = "Please provide markdown text and an output file name."
        status_label.config(text=status_message, foreground="red")
        print(status_message)

def parse_markdown():
    markdown_text = text_area.get("1.0", tk.END).strip()
    if markdown_text:
        is_valid, message = validate_markdown(markdown_text)
        if is_valid:
            status_label.config(text=message, foreground="green")
            for line in markdown_text.split('\n'):
                if line.startswith('## '):
                    file_name = sanitize_filename(line[3:].replace(':', ' -').strip()) + '.docx'
                    file_name_entry.delete(0, tk.END)
                    file_name_entry.insert(0, file_name)
                    break
        else:
            status_label.config(text=message, foreground="red")
        print(message)

app = ttkb.Window(themename="superhero")

app.title("Markdown to DOCX Converter")

text_area = tk.Text(app, wrap="word", height=20)
text_area.pack(padx=10, pady=10, fill="both", expand=True)

file_name_label = ttkb.Label(app, text="Output File Name:")
file_name_label.pack(pady=(10, 0))

file_name_entry = ttkb.Entry(app)
file_name_entry.pack(pady=(0, 10), fill="x", padx=10)

parse_button = ttkb.Button(app, text="Parse", command=parse_markdown)
parse_button.pack(pady=10)

convert_button = ttkb.Button(app, text="Create DOCX", command=create_docx)
convert_button.pack(pady=10)

status_label = ttkb.Label(app, text="")
status_label.pack(pady=(10, 0))

app.mainloop()

Combine Multiple HTML to Docx

I used this by exporting all of my notes on a topic into a folder, then running this script over the entire folder. It combined all of them into a single docx that is formatted exactly the way I wanted.

import os
import glob
import sys
from docx import Document
from docx.shared import Pt, RGBColor
from bs4 import BeautifulSoup
import markdown2
import re
from config import FONT_NAME, FONT_SIZE, H2_FONT_SIZE, H3_FONT_SIZE, H2_COLOR, H3_COLOR, LIST_INDENT

DEBUG = 0

def sanitize_filename(filename):
    sanitized = re.sub(r'[^A-Za-z0-9 -]', ' ', filename)
    sanitized = re.sub(r'\s+', ' ', sanitized).strip()
    return sanitized

def add_paragraph_with_style(document, text, style=None, indent=0, color=None, font_name=FONT_NAME, font_size=FONT_SIZE, bold=False):
    p = document.add_paragraph()
    if style:
        p.style = document.styles[style]
    if indent:
        p.paragraph_format.left_indent = Pt(indent)
    run = p.add_run(text)
    font = run.font
    font.name = font_name
    font.size = Pt(font_size)
    font.bold = bold
    if color:
        font.color.rgb = color
    return p

def add_bold_run(paragraph, text, font_name=FONT_NAME, font_size=FONT_SIZE, color=None):
    run = paragraph.add_run(text)
    run.bold = True
    font = run.font
    font.name = font_name
    font.size = Pt(font_size)
    if color:
        font.color.rgb = color
    return run

def add_run(paragraph, text, font_name=FONT_NAME, font_size=FONT_SIZE, color=None):
    run = paragraph.add_run(text)
    font = run.font
    font.name = font_name
    font.size = Pt(font_size)
    if color:
        font.color.rgb = color
    return run

def process_html_to_docx(html, document):
    soup = BeautifulSoup(html, 'html.parser')
    body = soup.body if soup.body else soup

    if DEBUG:
        print(f"Processing HTML content...")
    
    for element in body.descendants:
        if element.name:
            if DEBUG:
                print(f"Processing element: {element.name}")
        if element.name == 'h2':
            p = add_paragraph_with_style(document, '', style='Heading 2', font_size=H2_FONT_SIZE, bold=True)
            add_run(p, element.text, font_size=H2_FONT_SIZE, color=RGBColor(*H2_COLOR))
        elif element.name == 'h3':
            p = add_paragraph_with_style(document, '', style='Heading 3', indent=LIST_INDENT, font_size=H3_FONT_SIZE, bold=True)
            add_run(p, element.text, font_size=H3_FONT_SIZE, color=RGBColor(*H3_COLOR))
        elif element.name == 'ul':
            for li in element.find_all('li'):
                p = add_paragraph_with_style(document, '', indent=LIST_INDENT)
                add_run(p, '• ')
                add_run(p, li.text)
        elif element.name == 'p':
            p = add_paragraph_with_style(document, element.text, indent=LIST_INDENT)
        elif element.name == 'table':
            rows = element.find_all('tr')
            max_cols = max(len(row.find_all(['td', 'th'])) for row in rows)
            table = document.add_table(rows=len(rows), cols=max_cols)
            for i, row in enumerate(rows):
                cells = row.find_all(['td', 'th'])
                for j, cell in enumerate(cells):
                    p = table.cell(i, j).paragraphs[0]
                    add_run(p, cell.text)

def convert_markdown_to_docx(markdown_text, document):
    html = markdown2.markdown(markdown_text, extras=["tables"])
    process_html_to_docx(html, document)

def process_folder(folder_path):
    document = Document()
    files = glob.glob(os.path.join(folder_path, '*'))
    
    for file in files:
        if file.endswith('.md'):
            with open(file, 'r', encoding='utf-8') as f:
                markdown_text = f.read()
            convert_markdown_to_docx(markdown_text, document)
        elif file.endswith('.html'):
            with open(file, 'r', encoding='utf-8') as f:
                html_text = f.read()
            process_html_to_docx(html_text, document)
    
    output_file = os.path.join(folder_path, 'combined.docx')
    document.save(output_file)
    print(f"Document saved successfully as {output_file}")

if __name__ == "__main__":
    if len(sys.argv) != 2:
        script_name = os.path.basename(sys.argv[0])
        print(f"Usage: python {script_name} <folder_path>")
        sys.exit(1)
    
    folder_path = sys.argv[1]
    if not os.path.isdir(folder_path):
        print(f"The specified folder does not exist: {folder_path}")
        sys.exit(1)
    
    process_folder(folder_path)

Docx Output Example

The script above created a 98 page docx, which is the basis of my book. It has 20 chapters, and I outlined each with a similar format. Here is a snippet from chapter 2:

md2docx screenshot