"""
man
~~~
A parser for documents formatted with the man troff macros.
"""
from dataclasses import dataclass, field
from textwrap import wrap
from typing import Iterable, Optional, Sequence
from blessed import Terminal
# Base token classes.
@dataclass
class Token:
"""A superclass for lexical tokens."""
def process_next(self, line: str) -> bool:
"""Process the next line of text."""
return True
def parse(
self,
width: Optional[int] = None,
margin: int = 0,
indent: int = 4
) -> tuple[str, int, int]:
"""Parse the token into text."""
return str(self), margin, indent
def _parse_escapes(self, text: str) -> str:
"""Transform escape sequences into their character equivalents."""
if '\\' not in text:
return text
text = text.replace('\\.', '.')
text = text.replace('\\\\', '\\')
return text
@dataclass
class NonPrinting(Token):
def parse(
self,
width: Optional[int] = None,
margin: int = 0,
indent: int = 4
) -> tuple[str, int, int]:
"""Parse the token into text."""
return '', margin, indent
@dataclass
class Text(Token):
text: str
def __str__(self) -> str:
return self._parse_escapes(self.text)
@dataclass
class AlternatingFontStyleToken(Text):
text: str = ''
def _alternate_style(self, style_a: str, style_b: str) -> str:
term = Terminal()
words = self.text.split(' ')
style = style_a
formatteds = []
for word in words:
formatted = f'{style}{word}{term.normal}'
formatteds.append(formatted)
if style == style_a:
style = style_b
else:
style = style_a
return ' '.join(formatteds)
@dataclass
class MultilineFontStyleToken(Text):
text: str = ''
@dataclass
class ContainerToken(Token):
"""A superclass for tokens that contain other tokens."""
def _parse_contents(
self,
contents: list[Token],
width: Optional[int] = None,
margin: int = 0,
indent: int = 4
) -> str:
"""Parse the text tokens of the token."""
# Remove pre-existing hard wrapping.
text_like = (Text, Option, EmailAddress, Url)
if all(isinstance(token, text_like) for token in contents):
lines = [token.parse(width)[0].rstrip() for token in contents]
paragraph = ' '.join(line for line in lines)
# Wrap the text for the width, margin, and indent.
wrapped = [paragraph,]
if width is not None:
term = Terminal()
wrap_width = width - margin - indent
wrapped = term.wrap(paragraph, wrap_width)
# Add indentation and return.
lead = ' ' * (margin + indent)
text = '\n'.join(f'{lead}{line}' for line in wrapped)
return f'{text}\n'
else:
text = ''
for token in contents:
parsed, *_ = token.parse(width, margin, indent)
text += parsed
return f'{text.rstrip()}\n'
# Document structure tokens.
@dataclass
class Example(Token):
contents: list[Text] = field(default_factory=list)
def process_next(self, line: str) -> bool:
"""Process the next line of text."""
if line.startswith('.EE'):
return True
token = Text(line)
self.contents.append(token)
return False
def parse(
self,
width: Optional[int] = None,
margin: int = 0,
indent: int = 4
) -> tuple[str, int, int]:
"""Parse the token into text."""
act_width = width
if width is not None:
act_width = width - margin - indent
lead = ' ' * (margin + indent)
text = f'{lead}{self.contents[0].text[:act_width]}\n'
for token in self.contents[1:]:
text = f'{text}{lead}{token.text[:act_width]}\n'
return text, margin, indent
@dataclass
class RelativeIndentEnd(NonPrinting):
indent: str = '1'
def parse(
self,
width: Optional[int] = None,
margin: int = 0,
indent: int = 4
) -> tuple[str, int, int]:
return '', margin - int(self.indent), indent
@dataclass
class RelativeIndentStart(NonPrinting):
indent: str = '1'
def parse(
self,
width: Optional[int] = None,
margin: int = 0,
indent: int = 4
) -> tuple[str, int, int]:
return '', margin + int(self.indent), indent
@dataclass
class Section(ContainerToken):
heading_text: str = ''
contents: list[Token] = field(default_factory=list)
def process_next(self, line: str) -> bool:
"""Process the next line of text."""
stripped = line.rstrip()
# If a heading wasn't given as a parameter of the macro, the
# first line of text after the macro is the heading.
if not self.heading_text:
self.heading_text = stripped
return False
# Weed out blank lines.
if not stripped:
return False
# Once there is a header, the next lines are the contents of
# the first paragraph of the section.
token: Optional[Token] = _process_font_style_macro(
line,
self.contents
)
if token:
self.contents.append(token)
return False
# If it wasn't the head or contents, then close the macro.
return True
def parse(
self,
width: Optional[int] = None,
margin: int = 0,
indent: int = 4
) -> tuple[str, int, int]:
"""Parse the token into text."""
margin = 0
indent = 4
term = Terminal()
header = f'{term.bold}{self.heading_text}{term.normal}\n'
contents = self._parse_contents(self.contents, width, margin, indent)
text = f'{header}{contents}\n'
return text, margin, indent
@dataclass
class Subheading(ContainerToken):
subheading_text: str = ''
contents: list[Token] = field(default_factory=list)
def process_next(self, line: str) -> bool:
"""Process the next line of text."""
stripped = line.rstrip()
# If a heading wasn't given as a parameter of the macro, the
# first line of text after the macro is the heading.
if not self.subheading_text:
self.subheading_text = stripped
return False
# Weed out blank lines.
if not stripped:
return False
# Once there is a header, the next lines are the contents of
# the first paragraph of the section.
token: Optional[Token] = _process_font_style_macro(
line,
self.contents
)
if token:
self.contents.append(token)
return False
# If it wasn't the head or contents, then close the macro.
return True
def parse(
self,
width: Optional[int] = None,
margin: int = 0,
indent: int = 4
) -> tuple[str, int, int]:
"""Parse the token into text."""
margin = 0
indent = 4
term = Terminal()
head = f' {term.bold}{self.subheading_text}{term.normal}\n'
contents = self._parse_contents(self.contents, width, margin, indent)
text = f'{head}{contents}'
return f'{text}\n', margin, indent
@dataclass
class Title(Token):
title: str
section: str = ''
footer_middle: str = ''
footer_inside: str = ''
header_middle: str = ''
def __str__(self) -> str:
if self.section:
return f'{self.title.upper()}({self.section})'
return self.title.upper()
def footer(self, width: Optional[int] = None) -> str:
l_text = self.footer_inside
m_text = self.footer_middle
r_text = str(self)
total_text = len(l_text) + len(m_text) + len(r_text)
if width is None:
width = total_text + 2
total_gap = width - total_text
l_gap = ' ' * (total_gap // 2)
r_gap = ' ' * (-(-total_gap // 2))
text = f'\n\n\n{l_text}{l_gap}{m_text}{r_gap}{r_text}\n'
return text
def parse(
self,
width: Optional[int] = None,
margin: int = 0,
indent: int = 4
) -> tuple[str, int, int]:
title = str(self)
total_text = len(title) * 2 + len(self.header_middle)
if width is None:
width = total_text + 2
total_gap = width - total_text
l_gap = ' ' * (total_gap // 2)
r_gap = ' ' * (-(-total_gap // 2))
text = f'{title}{l_gap}{self.header_middle}{r_gap}{title}\n\n\n\n'
return text, margin, indent
# Paragraph tokens.
@dataclass
class Paragraph(ContainerToken):
contents: list[Token] = field(default_factory=list)
def process_next(self, line: str) -> bool:
"""Process the next line."""
if not line:
return False
token: Optional[Token] = _process_font_style_macro(
line,
self.contents
)
if token:
self.contents.append(token)
return False
return True
def parse(
self,
width: Optional[int] = None,
margin: int = 0,
indent: int = 4
) -> tuple[str, int, int]:
"""Parse the token into text."""
indent = 4
parsed = self._parse_contents(self.contents, width, margin, indent)
return f'{parsed}\n', margin, indent
@dataclass
class IndentedParagraph(ContainerToken):
tag: str = ''
indent: str = ''
contents: list[Token] = field(default_factory=list)
def process_next(self, line: str) -> bool:
"""Process the next line."""
if not line:
return False
token: Optional[Token] = _process_font_style_macro(line)
if token:
self.contents.append(token)
return False
return True
def parse(
self,
width: Optional[int] = None,
margin: int = 0,
indent: int = 4
) -> tuple[str, int, int]:
"""Parse the token into text."""
# .IP doesn't change the margin, but it will change the indent.
if self.indent:
indent = int(self.indent)
# Build the paragraph.
contents = self._parse_contents(self.contents, width, margin, indent)
# If the paragraph is tagged, add the tag.
tag = self._parse_escapes(self.tag)
lead = ' ' * margin
if len(self.tag) < indent:
gap = margin + indent
text = f'{lead}{tag: <{indent}}{contents[gap:]}\n'
else:
text = f'{lead}{tag}\n{contents}\n'
# Return the text, margin, and new indent.
return text, margin, indent
@dataclass
class TaggedParagraph(ContainerToken):
indent: str = ''
tag: list[str] = field(default_factory=list)
contents: list[Token] = field(default_factory=list)
_tag_flag: bool = False
def process_next(self, line: str) -> bool:
"""Process the next line."""
token: Optional[Token] = None
end = False
if not self.tag:
self.tag.append(line.rstrip())
elif line.startswith('.TQ'):
self._tag_flag = True
elif self._tag_flag:
self.tag.append(line.rstrip())
self._tag_flag = False
elif line:
token = _process_font_style_macro(line, self.contents)
if line and not token:
end = True
if token:
self.contents.append(token)
return end
def parse(
self,
width: Optional[int] = None,
margin: int = 0,
indent: int = 4
) -> tuple[str, int, int]:
"""Parse the token into text."""
# .TP doesn't change the margin, but it will change the indent.
if self.indent:
indent = int(self.indent)
# Build the paragraph.
contents = self._parse_contents(self.contents, width, margin, indent)
# If the paragraph has multiple tags, add all tags but the
# last one.
parsed_tags = [self._parse_escapes(tag) for tag in self.tag]
lead = ' ' * margin
tags = ''
for tag in parsed_tags[:-1]:
tags += f'{lead}{tag}\n'
# Add the last or only tag.
tag = parsed_tags[-1]
if len(parsed_tags) >= 1 and len(tag) < indent:
gap = margin + indent
text = f'{tags}{lead}{tag: <{indent}}{contents[gap:]}\n'
else:
text = f'{tags}{lead}{tag}\n{contents}\n'
# Return the text, margin, and new indent.
return text, margin, indent
# Command synopsis tokens.
@dataclass
class Option(Token):
option_name: str
option_argument: str = ''
def __str__(self) -> str:
term = Terminal()
if self.option_argument:
return (
f'[{term.bold}{self.option_name}{term.normal} '
f'{term.underline}{self.option_argument}{term.normal}]'
)
return f'[{term.bold}{self.option_name}{term.normal}]'
@dataclass
class Synopsis(ContainerToken):
command: str
contents: list[Token] = field(default_factory=list)
def process_next(self, line: str) -> bool:
token: Optional[Token] = None
if (
is_macro_type(STRUCTURE_TOKENS, line)
or is_macro_type(PARAGRAPH_TOKENS, line)
):
return True
if line.startswith('.YS'):
pass
elif line.startswith('.OP'):
args = line.rstrip().split(' ')
token = Option(*args[1:])
self.contents.append(token)
elif line.startswith('.SY'):
args = line.rstrip().split(' ')
token = Synopsis(args[1])
self.contents.append(token)
return False
def parse(
self,
width: Optional[int] = None,
margin: int = 0,
indent: int = 4
) -> tuple[str, int, int]:
"""Parse the token into text."""
if any(isinstance(token, Synopsis) for token in self.contents):
text = self._parse_multiple_synopsis(width, margin, indent)
return text, margin, indent
text = self._parse_single_synopsis(width, margin, indent)
return text, margin, indent
def _parse_multiple_synopsis(
self,
width: Optional[int] = None,
margin: int = 0,
indent: int = 0
) -> str:
# Split contents into multiple synopses.
synopses = []
synopsis = Synopsis(self.command)
for token in self.contents:
if isinstance(token, Synopsis):
synopses.append(synopsis)
synopsis = token
else:
synopsis.contents.append(token)
else:
synopses.append(synopsis)
# Get the text for each synopsis, concatenate, and return.
text = ''
for synopsis in synopses:
parsed, *_ = synopsis.parse(width, margin, indent)
text = f'{text}{parsed.rstrip()}\n'
return f'{text}\n'
def _parse_single_synopsis(
self,
width: Optional[int] = None,
margin: int = 0,
indent: int = 4
) -> str:
# Build the command label.
term = Terminal()
lead = ' ' * (margin + indent)
command = f'{lead}{term.bold}{self.command}{term.normal}'
# Build the options list.
opt_indent = len(self.command) + 1 + margin + indent
options = self._parse_contents(
self.contents,
width,
margin,
opt_indent
)
# Build the final output and return.
text = f'{command} {options[opt_indent:]}\n'
return text
# Hyperlink and email tokens.
@dataclass
class EmailAddress(ContainerToken):
address: str
contents: list[Token] = field(default_factory=list)
punctuation: str = ''
def process_next(self, line: str) -> bool:
"""Process the next line."""
if (
is_macro_type(STRUCTURE_TOKENS, line)
or is_macro_type(PARAGRAPH_TOKENS, line)
or is_macro_type(COMMAND_SYNOPSIS_TOKENS, line)
):
return True
if line.startswith('.ME'):
args = line.rstrip().split(' ')
if len(args) > 1:
self.punctuation = args[1]
elif line:
token = Text(line.rstrip())
self.contents.append(token)
return False
def parse(
self,
width: Optional[int] = None,
margin: int = 0,
indent: int = 4
) -> tuple[str, int, int]:
"""Parse the token into text."""
term = Terminal()
addr = f'mailto:{self.address}'
text = self._parse_contents(self.contents, None, 0, 0).rstrip()
link = term.link(addr, text)
return f'{link}{self.punctuation}', margin, indent
@dataclass
class Url(ContainerToken):
address: str
contents: list[Token] = field(default_factory=list)
punctuation: str = ''
def process_next(self, line: str) -> bool:
"""Process the next line."""
if (
is_macro_type(STRUCTURE_TOKENS, line)
or is_macro_type(PARAGRAPH_TOKENS, line)
or is_macro_type(COMMAND_SYNOPSIS_TOKENS, line)
):
return True
if line.startswith('.UE'):
args = line.rstrip().split(' ')
if len(args) > 1:
self.punctuation = args[1]
elif line:
token = Text(line.rstrip())
self.contents.append(token)
return False
def parse(
self,
width: Optional[int] = None,
margin: int = 0,
indent: int = 4
) -> tuple[str, int, int]:
"""Parse the token into text."""
term = Terminal()
text = self._parse_contents(self.contents, None, 0, 0).rstrip()
link = term.link(self.address, text)
return f'{link}{self.punctuation}', margin, indent
# Font style macros.
@dataclass
class Bold(MultilineFontStyleToken):
text: str = ''
def __str__(self) -> str:
term = Terminal()
return f'{term.bold}{self.text}{term.normal}'
@dataclass
class Italic(MultilineFontStyleToken):
text: str = ''
def __str__(self) -> str:
term = Terminal()
return f'{term.underline}{self.text}{term.normal}'
@dataclass
class Small(MultilineFontStyleToken):
text: str = ''
@dataclass
class SmallBold(MultilineFontStyleToken):
text: str = ''
def __str__(self) -> str:
term = Terminal()
return f'{term.bold}{self.text}{term.normal}'
# Alternating font style macros
@dataclass
class BoldItalic(AlternatingFontStyleToken):
text: str = ''
def __str__(self) -> str:
term = Terminal()
return self._alternate_style(term.bold, term.underline)
@dataclass
class BoldRoman(AlternatingFontStyleToken):
text: str = ''
def __str__(self) -> str:
term = Terminal()
return self._alternate_style(term.bold, '')
@dataclass
class ItalicBold(AlternatingFontStyleToken):
text: str = ''
def __str__(self) -> str:
term = Terminal()
return self._alternate_style(term.underline, term.bold)
@dataclass
class ItalicRoman(AlternatingFontStyleToken):
text: str = ''
def __str__(self) -> str:
term = Terminal()
return self._alternate_style(term.underline, '')
@dataclass
class RomanBold(AlternatingFontStyleToken):
text: str = ''
def __str__(self) -> str:
term = Terminal()
return self._alternate_style('', term.bold)
@dataclass
class RomanItalic(AlternatingFontStyleToken):
text: str = ''
def __str__(self) -> str:
term = Terminal()
return self._alternate_style('', term.underline)
# Other tokens.
@dataclass
class Empty(Text):
text: str = ''
# Token collections.
STRUCTURE_TOKENS: dict[str, Optional[type]] = {
'.ee': None,
'.ex': Example,
'.re': RelativeIndentEnd,
'.rs': RelativeIndentStart,
'.sh': Section,
'.ss': Subheading,
'.th': Title,
}
PARAGRAPH_TOKENS: dict[str, Optional[type]] = {
'.ip': IndentedParagraph,
'.lp': Paragraph,
'.p': Paragraph,
'.pp': Paragraph,
'.tp': TaggedParagraph,
}
COMMAND_SYNOPSIS_TOKENS: dict[str, Optional[type]] = {
'.sy': Synopsis,
'.op': Option,
'.ys': None,
}
# Lexer functions.
def _build_multiline_font_style_token(
class_: type,
line: str
) -> MultilineFontStyleToken:
token = class_()
if ' ' in line:
split_ = line.split(' ', 1)
token.text = split_[1]
return token
def _build_singleline_font_style_token(
class_: type,
line: str
) -> Text:
split_ = line.split(' ', 1)
return class_(split_[1])
def is_macro_type(macros: Iterable[str], line: str) -> bool:
"""Does the given line match a non-font style macro."""
for macro in macros:
folded = line.casefold()
if folded.startswith(macro.casefold()):
return True
return False
def _process_font_style_macro(
line: str,
contents: Optional[list[Token]] = None
) -> Optional[Token]:
"""Process a font style macro discovered while processing a
multiline macro.
"""
token: Optional[Token] = None
stripped = line.rstrip()
if (
is_macro_type(STRUCTURE_TOKENS, stripped)
or is_macro_type(PARAGRAPH_TOKENS, stripped)
or is_macro_type(COMMAND_SYNOPSIS_TOKENS, stripped)
):
pass
elif (
not stripped.startswith('.')
and contents
and isinstance(contents[-1], Text)
and not contents[-1].text
):
token = contents.pop()
if isinstance(token, MultilineFontStyleToken):
token.text = stripped
elif not stripped.startswith('.'):
token = Text(stripped)
elif stripped.startswith('.BI'):
token = _build_singleline_font_style_token(BoldItalic, stripped)
elif stripped.startswith('.BR'):
token = _build_singleline_font_style_token(BoldRoman, stripped)
elif stripped.startswith('.B'):
token = _build_multiline_font_style_token(Bold, stripped)
elif stripped.startswith('.IB'):
token = _build_singleline_font_style_token(ItalicBold, stripped)
elif stripped.startswith('.IR'):
token = _build_singleline_font_style_token(ItalicRoman, stripped)
elif stripped.startswith('.I'):
token = _build_multiline_font_style_token(Italic, stripped)
elif stripped.startswith('.RB'):
token = _build_singleline_font_style_token(RomanBold, stripped)
elif stripped.startswith('.RI'):
token = _build_singleline_font_style_token(RomanItalic, stripped)
elif stripped.startswith('.SB'):
token = _build_multiline_font_style_token(SmallBold, stripped)
elif stripped.startswith('.SM'):
token = _build_multiline_font_style_token(Small, stripped)
else:
token = Empty(stripped[1:])
return token
def lex(text: str) -> tuple[Token, ...]:
"""Lex the given document."""
lines = text.split('\n')
tokens: list[Token] = []
state: Optional[Token] = None
buffer = ''
for line in lines:
token: Optional[Token] = None
# Handle multiline macros.
if state:
if state.process_next(line):
tokens.append(state)
state = None
# Determine the relevant macro for the line and create
# the token for that macro.
if state:
pass
elif line.startswith('.EE'):
token = None
elif line.startswith('.EX'):
state = Example()
elif line.startswith('.IP'):
args = line.rstrip().split(' ')
if len(args) == 2:
state = IndentedParagraph(args[1])
elif len(args) > 2:
state = IndentedParagraph(args[1], args[2])
else:
state = IndentedParagraph()
elif line.startswith('.MT'):
args = line.split(' ')
state = EmailAddress(args[1])
elif (
line.startswith('.P')
or line.startswith('.LP')
or line.startswith('.PP')
):
state = Paragraph()
elif line.startswith('.RE'):
args = line.split(' ')
if args[1:]:
token = RelativeIndentEnd(args[1])
else:
token = RelativeIndentEnd()
elif line.startswith('.RS'):
args = line.split(' ')
if args[1:]:
token = RelativeIndentStart(args[1])
else:
token = RelativeIndentStart()
elif line.startswith('.SH'):
args = line.split(' ', 1)
if len(args) > 1:
state = Section(args[1])
else:
state = Section()
elif line.startswith('.SS'):
args = line.split(' ', 1)
if len(args) > 1:
state = Subheading(args[1])
else:
state = Subheading()
elif line.startswith('.SY'):
args = line.split(' ')
state = Synopsis(args[1])
elif line.startswith('.TH'):
args = line.split(' ')
token = Title(*args[1:])
elif line.startswith('.TP'):
args = line.rstrip().split(' ')
if len(args) == 2:
state = TaggedParagraph(args[1])
elif len(args) > 2:
state = TaggedParagraph(args[1], [args[2],])
else:
state = TaggedParagraph()
elif line.startswith('.UR'):
args = line.split(' ')
state = Url(args[1])
elif line.startswith('.'):
token = Empty(line[1:])
elif line:
token = Text(line.rstrip())
# Add the token to the lexed document.
if token:
tokens.append(token)
else:
if state:
tokens.append(state)
state = None
return tuple(tokens)
# Parsing.
def parse(tokens: Sequence[Token], width: Optional[int] = 80) -> str:
"""Parse the tokens into a string."""
text = ''
footer = ''
margin = 0
indent = 4
for token in tokens:
if isinstance(token, Title):
footer = token.footer(width)
parsed, margin, indent = token.parse(width, margin, indent)
text += parsed
if footer:
text = f'{text}{footer}'
return text
# Main line.
[docs]
def to_term(text: str, width: Optional[int] = None) -> str:
"""Convert man-style macros into terminal ready text.
:param text: A :class:`str` with man troff macros.
:param width: (Optional.) The width of the terminal as a
:class:`int`. Defaults to `None`.
:return: The troff macros turned into a string ready to display in
the terminal as a :class:`str`.
:rtype: str
:usage:
To convert troff macros to terminal ready text:
>>> from thurible import man
>>> macro = '.RS 4\\n.P\\nThis paragraph is indented.'
>>> man.to_term(macro)
' This paragraph is indented.\\n\\n'
"""
tokens = lex(text)
return parse(tokens, width)