# Source code for ftfy

"""
ftfy: fixes text for you

This is a module for making text less broken. See the `fix_text` function
for more information.
"""

import unicodedata
from typing import List, NamedTuple, Optional, Tuple, Union

from ftfy import bad_codecs
from ftfy import chardata, fixes
from ftfy.badness import is_bad
from ftfy.formatting import display_ljust

# Version string of this ftfy release.
__version__ = "6.0"


# Though this function does nothing, it lets linters know that we're using
# ftfy.bad_codecs. See the docstring in `bad_codecs/__init__.py` for more.
# NOTE(review): the non-standard codec names used further down in this module
# ("utf-8-variants", "sloppy-windows-1252") are presumably registered as a side
# effect of importing ftfy.bad_codecs -- confirm against that package.
bad_codecs.ok()


class ExplainedText(NamedTuple):
    """
    The result type of the ftfy functions that can explain what they did,
    such as :func:`fix_and_explain()`.

    It is a 2-tuple, so it can be unpacked as ``text, explanation = result``:

    - `text` is the fixed text.
    - `explanation` is the list of steps that were applied to fix the text,
      each one an ``(operation, argument)`` pair of strings.

    When the 'explain' option is disabled, these functions still return an
    ExplainedText, but its `explanation` is None.
    """

    # The text after fixing.
    text: str
    # The steps that were applied, or None when explanations are disabled.
    explanation: Optional[List[Tuple[str, str]]]
class TextFixerConfig(NamedTuple):
    r"""
    A TextFixerConfig object stores configuration options for ftfy.

    It's implemented as a namedtuple with defaults, so you can instantiate
    it by providing the values to change from their defaults as keyword
    arguments. For example, to disable 'unescape_html' and keep the rest of
    the defaults::

        TextFixerConfig(unescape_html=False)

    Here are the options and their default values:

    - `unescape_html`: "auto"

      Configures whether to replace HTML entities such as &amp; with the
      character they represent. "auto" says to do this by default, but
      disable it when a literal < character appears, indicating that the
      input is actual HTML and entities should be preserved. The value can
      be True, to always enable this fixer, or False, to always disable it.

    - `remove_terminal_escapes`: True

      Removes "ANSI" terminal escapes, such as for changing the color of
      text in a terminal window.

    - `fix_encoding`: True

      Detect mojibake and attempt to fix it by decoding the text in a
      different encoding standard.

      The following four options affect how `fix_encoding` works, and do
      nothing if `fix_encoding` is False:

      - `restore_byte_a0`: True

        Allow a literal space (U+20) to be interpreted as a non-breaking
        space (U+A0) when that would make it part of a fixable mojibake
        string.

        Because spaces are very common characters, this could lead to false
        positives, but we try to apply it only when there's strong evidence
        for mojibake. Disabling `restore_byte_a0` is safer from false
        positives, but creates false negatives.

      - `replace_lossy_sequences`: True

        Detect mojibake that has been partially replaced by the characters
        '�' or '?'. If the mojibake could be decoded otherwise, replace the
        detected sequence with '�'.

      - `decode_inconsistent_utf8`: True

        When we see sequences that distinctly look like UTF-8 mojibake, but
        there's no consistent way to reinterpret the string in a new
        encoding, replace the mojibake with the appropriate UTF-8
        characters anyway.

        This helps to decode strings that are concatenated from different
        encodings.

      - `fix_c1_controls`: True

        Replace C1 control characters (the useless characters U+80 - U+9F
        that come from Latin-1) with their Windows-1252 equivalents, like
        HTML5 does, even if the whole string doesn't decode as Latin-1.

    - `fix_latin_ligatures`: True

      Replace common Latin-alphabet ligatures, such as ``ﬁ``, with the
      letters they're made of.

    - `fix_character_width`: True

      Replace fullwidth Latin characters and halfwidth Katakana with
      their more standard widths.

    - `uncurl_quotes`: True

      Replace curly quotes with straight quotes.

    - `fix_line_breaks`: True

      Replace various forms of line breaks with the standard Unix line
      break, ``\n``.

    - `fix_surrogates`: True

      Replace sequences of UTF-16 surrogate codepoints with the character
      they were meant to encode. This fixes text that was decoded with the
      obsolete UCS-2 standard, and allows it to support high-numbered
      codepoints such as emoji.

    - `remove_control_chars`: True

      Remove certain control characters that have no displayed effect on
      text.

    - `normalization`: "NFC"

      Choose what kind of Unicode normalization is applied. Usually, we
      apply NFC normalization, so that letters followed by combining
      characters become single combined characters.

      Changing this to "NFKC" applies more compatibility conversions, such
      as replacing the 'micro sign' with a standard Greek lowercase mu,
      which looks identical. However, some NFKC normalizations change the
      meaning of text, such as converting "10³" to "103".

      `normalization` can be None, to apply no normalization.

    - `max_decode_length`: 1_000_000

      The maximum size of "segment" that ftfy will try to fix all at once.

    - `explain`: True

      Whether to compute 'explanations', lists describing what ftfy changed.
      When this is False, the explanation will be None, and the code that
      builds the explanation will be skipped, possibly saving time.

      Functions that accept TextFixerConfig and don't return an explanation
      will automatically set `explain` to False.
    """

    # HTML entity handling: True, False, or "auto" (see the docstring).
    unescape_html: Union[str, bool] = "auto"
    remove_terminal_escapes: bool = True

    # Mojibake detection, and the four sub-options that tune it.
    fix_encoding: bool = True
    restore_byte_a0: bool = True
    replace_lossy_sequences: bool = True
    decode_inconsistent_utf8: bool = True
    fix_c1_controls: bool = True

    # Character-level cleanups.
    fix_latin_ligatures: bool = True
    fix_character_width: bool = True
    uncurl_quotes: bool = True
    fix_line_breaks: bool = True
    fix_surrogates: bool = True
    remove_control_chars: bool = True

    # Name of a Unicode normalization form, or None to skip normalization.
    normalization: Optional[str] = "NFC"

    # Upper bound, in codepoints, on the size of one segment.
    max_decode_length: int = 1_000_000

    # Whether to build the list of applied steps.
    explain: bool = True
# Maps the name of each simple fixer option in TextFixerConfig to the function
# in `ftfy.fixes` that implements it. These names are also the arguments that
# can appear in 'apply' and 'transcode' steps of an explanation.
FIXERS = {
    "unescape_html": fixes.unescape_html,
    "remove_terminal_escapes": fixes.remove_terminal_escapes,
    "restore_byte_a0": fixes.restore_byte_a0,
    "replace_lossy_sequences": fixes.replace_lossy_sequences,
    "decode_inconsistent_utf8": fixes.decode_inconsistent_utf8,
    "fix_c1_controls": fixes.fix_c1_controls,
    "fix_latin_ligatures": fixes.fix_latin_ligatures,
    "fix_character_width": fixes.fix_character_width,
    "uncurl_quotes": fixes.uncurl_quotes,
    "fix_line_breaks": fixes.fix_line_breaks,
    "fix_surrogates": fixes.fix_surrogates,
    "remove_control_chars": fixes.remove_control_chars,
}


# The message of the UnicodeError raised when a text-fixing function is given
# bytes. It is laid out as paragraphs and a bullet list so that it stays
# readable when it shows up in a traceback.
BYTES_ERROR_TEXT = """Hey wait, this isn't Unicode.

ftfy is designed to fix problems with text. Treating bytes like they're
interchangeable with Unicode text is usually something that introduces
problems with text.

You should first decode these bytes from the encoding you think they're in.
If you're not sure what encoding they're in:

- First, try to find out. 'utf-8' is a good assumption.
- If the encoding is simply unknowable, try running your bytes through
  ftfy.guess_bytes. As the name implies, this may not always be accurate.

For more information on the distinction between bytes and text, read the
Python Unicode HOWTO:

    http://docs.python.org/3/howto/unicode.html
"""


def _try_fix(
    fixer_name: str, text: str, config: TextFixerConfig, steps: Optional[list]
) -> str:
    """
    A helper function used across several 'fixer' steps, deciding whether to
    apply the fix and whether to record the fix in `steps`.

    `fixer_name` is both the name of an option on `config` and a key of
    `FIXERS`. If that option has a false value, `text` is returned unchanged.
    Otherwise the fixer is applied and its result is returned.

    `steps` is the explanation being accumulated, or None when explanations
    are disabled. An ``("apply", fixer_name)`` step is appended to it only
    when the fixer actually changed the text.
    """
    if not getattr(config, fixer_name):
        return text

    fixed = FIXERS[fixer_name](text)
    if steps is not None and fixed != text:
        steps.append(("apply", fixer_name))
    return fixed
def fix_text(text: str, config: Optional[TextFixerConfig] = None, **kwargs) -> str:
    r"""
    Given Unicode text as input, fix inconsistencies and glitches in it,
    such as mojibake (text that was decoded in the wrong encoding).

    Let's start with some examples:

        >>> fix_text('âœ” No problems')
        '✔ No problems'

        >>> print(fix_text("&macr;\\_(ã\x83\x84)_/&macr;"))
        ¯\_(ツ)_/¯

        >>> fix_text('Broken text&hellip; it&#x2019;s ﬂubberiﬁc!')
        "Broken text… it's flubberific!"

        >>> fix_text('ＬＯＵＤ　ＮＯＩＳＥＳ')
        'LOUD NOISES'

    ftfy applies a number of different fixes to the text, and can accept
    configuration to select which fixes to apply.

    The configuration takes the form of a :class:`TextFixerConfig` object,
    and you can see a description of the options in that class's docstring
    or in the full documentation at ftfy.readthedocs.org.

    For convenience and backward compatibility, the configuration can also
    take the form of keyword arguments, which will set the equivalently-named
    fields of the TextFixerConfig object.

    For example, here are two ways to fix text but skip the "uncurl_quotes"
    step::

        fix_text(text, TextFixerConfig(uncurl_quotes=False))
        fix_text(text, uncurl_quotes=False)

    This function fixes text in independent segments, which are usually
    lines of text, or arbitrarily broken up every 1 million codepoints
    (configurable with `config.max_decode_length`) if there aren't enough
    line breaks. The bound on segment lengths helps to avoid unbounded
    slowdowns.

    ftfy can also provide an 'explanation', a list of transformations it
    applied to the text that would fix more text like it. This function
    doesn't provide explanations (because there may be different fixes for
    different segments of text).

    To get an explanation, use the :func:`fix_and_explain()` function, which
    fixes the string in one segment and explains what it fixed.

    Raises UnicodeError if `text` is bytes, and ValueError if
    `max_decode_length` is less than 1.
    """
    if config is None:
        config = TextFixerConfig()
    config = config._replace(**kwargs)
    # This function discards explanations, so there's no point computing them.
    config = config._replace(explain=False)

    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)

    # A segment must contain at least one codepoint. With a smaller limit,
    # the loop below would never advance `pos` and would run forever.
    if config.max_decode_length < 1:
        raise ValueError(
            "max_decode_length must be at least 1, not %r"
            % (config.max_decode_length,)
        )

    out = []
    pos = 0
    while pos < len(text):
        # A segment ends just after the next line break, or at the end of
        # the text if there are no more line breaks.
        textbreak = text.find("\n", pos) + 1
        if textbreak == 0:
            textbreak = len(text)
        # Cut overly long segments short, even in the middle of a line.
        if (textbreak - pos) > config.max_decode_length:
            textbreak = pos + config.max_decode_length

        segment = text[pos:textbreak]
        # Once a literal '<' is seen, treat this segment and all the ones
        # after it as HTML, whose entities should be left alone.
        if config.unescape_html == "auto" and "<" in segment:
            config = config._replace(unescape_html=False)
        fixed_segment, _ = fix_and_explain(segment, config)
        out.append(fixed_segment)
        pos = textbreak
    return "".join(out)
def fix_and_explain(
    text: str, config: Optional[TextFixerConfig] = None, **kwargs
) -> ExplainedText:
    """
    Fix text as a single segment, returning the fixed text and an explanation
    of what was fixed.

    The explanation is a list of steps that can be applied with
    :func:`apply_plan`, or if config.explain is False, it will be None.

    The configuration can be given as a :class:`TextFixerConfig`, as keyword
    arguments naming its fields, or both (the keyword arguments win).

    Raises UnicodeError if `text` is bytes.
    """
    if config is None:
        config = TextFixerConfig()
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    config = config._replace(**kwargs)

    # In "auto" mode, a literal '<' means the input is probably real HTML,
    # whose entities should be preserved.
    if config.unescape_html == "auto" and "<" in text:
        config = config._replace(unescape_html=False)

    if config.explain:
        steps: Optional[List[Tuple[str, str]]] = []
    else:
        # If explanations aren't desired, `steps` will be None
        steps = None

    # Fixing one problem can reveal another, so repeat the whole sequence of
    # fixes until a pass leaves the text unchanged.
    while True:
        origtext = text

        text = _try_fix("unescape_html", text, config, steps)

        if config.fix_encoding:
            # Always pass `config` along, so that the encoding sub-options
            # (`restore_byte_a0`, `replace_lossy_sequences`,
            # `decode_inconsistent_utf8`, `fix_c1_controls`) are respected
            # whether or not an explanation is being built.
            text, encoding_steps = fix_encoding_and_explain(text, config)
            if steps is not None:
                steps.extend(encoding_steps)

        for fixer in (
            "fix_c1_controls",
            "fix_latin_ligatures",
            "fix_character_width",
            "uncurl_quotes",
            "fix_line_breaks",
            "fix_surrogates",
            # NOTE(review): placed ahead of `remove_control_chars` on the
            # assumption that whole escape sequences should be stripped
            # before single control characters are considered -- confirm
            # against the fixes module.
            "remove_terminal_escapes",
            "remove_control_chars",
        ):
            text = _try_fix(fixer, text, config, steps)

        if config.normalization is not None:
            fixed = unicodedata.normalize(config.normalization, text)
            if steps is not None and fixed != text:
                steps.append(("normalize", config.normalization))
            text = fixed

        if text == origtext:
            return ExplainedText(text, steps)
def fix_encoding_and_explain(
    text: str, config: Optional[TextFixerConfig] = None, **kwargs
) -> ExplainedText:
    """
    Apply the steps of ftfy that detect mojibake and fix it. Returns the fixed
    text and a list explaining what was fixed.

    This includes fixing text by encoding and decoding it in different
    encodings, as well as the subordinate fixes `restore_byte_a0`,
    `replace_lossy_sequences`, `decode_inconsistent_utf8`, and
    `fix_c1_controls`.

    Examples::

        >>> fix_encoding_and_explain("sÃ³")
        ExplainedText(text='só', explanation=[('encode', 'latin-1'), ('decode', 'utf-8')])

        >>> result = fix_encoding_and_explain("voilÃ  le travail")
        >>> result.text
        'voilà le travail'
        >>> result.explanation
        [('encode', 'latin-1'), ('transcode', 'restore_byte_a0'), ('decode', 'utf-8')]

    Raises UnicodeError if `text` is bytes.
    """
    if config is None:
        config = TextFixerConfig()
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    config = config._replace(**kwargs)

    if not config.fix_encoding:
        # A weird trivial case: we're asked to fix the encoding, but skip
        # fixing the encoding
        return ExplainedText(text, [])

    # Keep applying single steps, collecting their plans, until a step
    # leaves the text unchanged.
    full_plan: List[Tuple[str, str]] = []
    changed = True
    while changed:
        fixed, step_plan = _fix_encoding_one_step_and_explain(text, config)
        full_plan.extend(step_plan)
        changed = fixed != text
        text = fixed
    return ExplainedText(text, full_plan)
def _fix_encoding_one_step_and_explain(
    text: str, config: TextFixerConfig
) -> ExplainedText:
    """
    Perform one step of fixing the encoding of text.

    Returns the text after at most one fix, along with the list of plan steps
    that describe that fix. The list is empty when nothing was changed.
    """
    if config is None:
        config = TextFixerConfig()

    if not text:
        return ExplainedText(text, [])

    # The first plan is to return ASCII text unchanged, as well as text
    # that doesn't look like it contains mojibake
    if chardata.possible_encoding(text, "ascii") or not is_bad(text):
        return ExplainedText(text, [])

    # As we go through the next step, remember the possible encodings
    # that we encounter but don't successfully fix yet. We may need them
    # later.
    possible_1byte_encodings = []

    # Suppose the text was supposed to be UTF-8, but it was decoded using
    # a single-byte encoding instead. When these cases can be fixed, they
    # are usually the correct thing to do, so try them next.
    for encoding in chardata.CHARMAP_ENCODINGS:
        if not chardata.possible_encoding(text, encoding):
            continue

        possible_1byte_encodings.append(encoding)
        encoded_bytes = text.encode(encoding)
        encode_step = ("encode", encoding)
        transcode_steps = []

        # Now, find out if it's UTF-8 (or close enough). Otherwise,
        # remember the encoding for later.
        try:
            decoding = "utf-8"

            # Check encoded_bytes for sequences that would be UTF-8,
            # except they have b' ' where b'\xa0' would belong.
            if config.restore_byte_a0 and chardata.ALTERED_UTF8_RE.search(
                encoded_bytes
            ):
                replaced_bytes = fixes.restore_byte_a0(encoded_bytes)
                if replaced_bytes != encoded_bytes:
                    transcode_steps.append(("transcode", "restore_byte_a0"))
                    encoded_bytes = replaced_bytes

            # Replace sequences where information has been lost
            if config.replace_lossy_sequences and encoding.startswith("sloppy"):
                replaced_bytes = fixes.replace_lossy_sequences(encoded_bytes)
                if replaced_bytes != encoded_bytes:
                    transcode_steps.append(("transcode", "replace_lossy_sequences"))
                    encoded_bytes = replaced_bytes

            # These two bytes call for the more lenient decoder.
            if 0xED in encoded_bytes or 0xC0 in encoded_bytes:
                decoding = "utf-8-variants"

            steps = [encode_step, *transcode_steps, ("decode", decoding)]
            fixed = encoded_bytes.decode(decoding)
            return ExplainedText(fixed, steps)

        except UnicodeDecodeError:
            # Not UTF-8 under this encoding; move on to the next candidate.
            pass

    # Look for a-hat-euro sequences that remain, and fix them in isolation.
    if config.decode_inconsistent_utf8 and chardata.UTF8_DETECTOR_RE.search(text):
        steps = [("apply", "decode_inconsistent_utf8")]
        fixed = fixes.decode_inconsistent_utf8(text)
        if fixed != text:
            return ExplainedText(fixed, steps)

    # The next most likely case is that this is Latin-1 that was intended to
    # be read as Windows-1252, because those two encodings in particular are
    # easily confused.
    if "latin-1" in possible_1byte_encodings:
        if "windows-1252" in possible_1byte_encodings:
            # This text is in the intersection of Latin-1 and
            # Windows-1252, so it's probably legit.
            return ExplainedText(text, [])

        # Otherwise, it means we have characters that are in Latin-1 but
        # not in Windows-1252. Those are C1 control characters. Nobody
        # wants those. Assume they were meant to be Windows-1252.
        try:
            fixed = text.encode("latin-1").decode("windows-1252")
            if fixed != text:
                steps = [("encode", "latin-1"), ("decode", "windows-1252")]
                return ExplainedText(fixed, steps)
        except UnicodeDecodeError:
            pass

    # Fix individual characters of Latin-1 with a less satisfying explanation
    if config.fix_c1_controls and chardata.C1_CONTROL_RE.search(text):
        steps = [("transcode", "fix_c1_controls")]
        fixed = fixes.fix_c1_controls(text)
        return ExplainedText(fixed, steps)

    # The cases that remain are mixups between two different single-byte
    # encodings, and not the common case of Latin-1 vs. Windows-1252.
    #
    # With the new heuristic in 6.0, it's possible that we're closer to solving
    # these in some cases. It would require a lot of testing and tuning, though.
    # For now, we leave the text unchanged in these cases.
    return ExplainedText(text, [])
def fix_encoding(text: str, config: TextFixerConfig = None, **kwargs):
    """
    Apply just the encoding-fixing steps of ftfy to this text. Returns the
    fixed text, discarding the explanation.

    The configuration can be given as a :class:`TextFixerConfig`, as keyword
    arguments naming its fields, or both.

        >>> fix_encoding("Ã³")
        'ó'
        >>> fix_encoding("&ATILDE;&SUP3;")
        '&ATILDE;&SUP3;'
    """
    if config is None:
        config = TextFixerConfig()
    config = config._replace(**kwargs)
    # Only the text is wanted; the explanation is dropped.
    return fix_encoding_and_explain(text, config).text
# Some alternate names for the main functions
ftfy = fix_text


def fix_text_segment(text: str, config: TextFixerConfig = None, **kwargs):
    """
    Fix text as a single segment, with a consistent sequence of steps that
    are applied to fix the text. Discard the explanation.

    When no `config` is given, the default configuration is used with
    explanations turned off.
    """
    if config is None:
        config = TextFixerConfig(explain=False)
    config = config._replace(**kwargs)
    return fix_and_explain(text, config).text


def fix_file(input_file, encoding=None, config=None, **kwargs):
    """
    Fix text that is found in a file.

    If the file is being read as Unicode text, use that. If it's being read as
    bytes, then we hope an encoding was supplied. If not, unfortunately, we
    have to guess what encoding it is. We'll try a few common encodings, but we
    make no promises. See the `guess_bytes` function for how this is done.

    The output is a stream of fixed lines of text.
    """
    if config is None:
        config = TextFixerConfig()
    config = config._replace(**kwargs)

    for line in input_file:
        if isinstance(line, bytes):
            if encoding is None:
                # The encoding guessed here is reused for all later lines.
                line, encoding = guess_bytes(line)
            else:
                line = line.decode(encoding)
        # Once a literal '<' has been seen, stop unescaping HTML entities
        # for the rest of the file.
        if config.unescape_html == "auto" and "<" in line:
            config = config._replace(unescape_html=False)

        yield fix_and_explain(line, config).text
def guess_bytes(bstring):
    """
    NOTE: Using `guess_bytes` is not the recommended way of using ftfy. ftfy
    is not designed to be an encoding detector.

    In the unfortunate situation that you have some bytes in an unknown
    encoding, ftfy can guess a reasonable strategy for decoding them, by trying
    a few common encodings that can be distinguished from each other.

    Unlike the rest of ftfy, this may not be accurate, and it may *create*
    Unicode problems instead of solving them!

    The encodings we try here are:

    - UTF-16 with a byte order mark, because a UTF-16 byte order mark looks
      like nothing else
    - UTF-8, because it's the global standard, which has been used by a
      majority of the Web since 2008
    - "utf-8-variants", or buggy implementations of UTF-8
    - MacRoman, because Microsoft Office thinks it's still a thing, and it
      can be distinguished by its line breaks. (If there are no line breaks in
      the string, though, you're out of luck.)
    - "sloppy-windows-1252", the Latin-1-like encoding that is the most common
      single-byte encoding.

    Returns a pair of the decoded text and the name of the encoding that was
    used. Raises UnicodeError if given a str instead of bytes.
    """
    if isinstance(bstring, str):
        raise UnicodeError(
            "This string was already decoded as Unicode. You should pass "
            "bytes to guess_bytes, not Unicode."
        )

    # A byte order mark, in either byte order, settles the question.
    if bstring.startswith((b"\xfe\xff", b"\xff\xfe")):
        return bstring.decode("utf-16"), "utf-16"

    byteset = set(bstring)

    if 0xED in byteset or 0xC0 in byteset:
        # Byte 0xed can be used to encode a range of codepoints that
        # are UTF-16 surrogates. UTF-8 does not use UTF-16 surrogates,
        # so when we see 0xed, it's very likely we're being asked to
        # decode CESU-8, the variant that encodes UTF-16 surrogates
        # instead of the original characters themselves.
        #
        # This will occasionally trigger on standard UTF-8, as there
        # are some Korean characters that also use byte 0xed, but that's
        # not harmful because standard UTF-8 characters will decode the
        # same way in our 'utf-8-variants' codec.
        #
        # Byte 0xc0 is impossible because, numerically, it would only
        # encode characters lower than U+0040. Those already have
        # single-byte representations, and UTF-8 requires using the
        # shortest possible representation. However, Java hides the null
        # codepoint, U+0000, in a non-standard longer representation -- it
        # encodes it as 0xc0 0x80 instead of 0x00, guaranteeing that 0x00
        # will never appear in the encoded bytes.
        #
        # The 'utf-8-variants' decoder can handle both of these cases, as
        # well as standard UTF-8, at the cost of a bit of speed.
        utf8_codec = "utf-8-variants"
    else:
        utf8_codec = "utf-8"

    try:
        return bstring.decode(utf8_codec), utf8_codec
    except UnicodeDecodeError:
        # Not UTF-8 of any kind; fall back to the single-byte encodings.
        pass

    if 0x0D in byteset and 0x0A not in byteset:
        # Files that contain CR and not LF are likely to be MacRoman.
        return bstring.decode("macroman"), "macroman"

    return bstring.decode("sloppy-windows-1252"), "sloppy-windows-1252"
def apply_plan(text: str, plan: List[Tuple[str, str]]):
    """
    Apply a plan for fixing the encoding of text.

    The plan is a list of tuples of the form (operation, arg).

    `operation` is one of:

    - `'encode'`: convert a string to bytes, using `arg` as the encoding
    - `'decode'`: convert bytes to a string, using `arg` as the encoding
    - `'transcode'`: convert bytes to bytes, using the function named `arg`
    - `'apply'`: convert a string to a string, using the function named `arg`

    The functions that can be applied by 'transcode' and 'apply' are
    specifically those that appear in the dictionary named `FIXERS`. They
    can also be imported from the `ftfy.fixes` module.

    Example::

        >>> mojibake = "schÃ¶n"
        >>> text, plan = fix_and_explain(mojibake)
        >>> apply_plan(mojibake, plan)
        'schön'

    Raises ValueError if the plan contains an unknown operation, or names a
    function that is not in `FIXERS`.
    """
    # `result` alternates between str and bytes as the plan is carried out.
    result = text
    for operation, encoding in plan:
        if operation == "encode":
            result = result.encode(encoding)
            continue
        if operation == "decode":
            result = result.decode(encoding)
            continue
        if operation not in ("transcode", "apply"):
            raise ValueError("Unknown plan step: %s" % operation)
        # For these two operations, `encoding` is really a fixer name.
        if encoding not in FIXERS:
            raise ValueError("Unknown function to apply: %s" % encoding)
        result = FIXERS[encoding](result)
    return result
def explain_unicode(text: str):
    """
    A utility method that's useful for debugging mysterious Unicode.

    It breaks down a string, showing you for each codepoint its number in
    hexadecimal, its glyph, its category in the Unicode standard, and its name
    in the Unicode standard. One line is printed per codepoint, and nothing
    is returned.

    For example, ``explain_unicode('(╯°□°)╯︵ ┻━┻')`` prints lines that begin
    roughly like this (the padding of the glyph column comes from
    `display_ljust`)::

        U+0028 (       [Ps] LEFT PARENTHESIS
        U+256F ╯       [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
        U+00B0 °       [So] DEGREE SIGN
        U+25A1 □       [So] WHITE SQUARE

    Codepoints that aren't printable are shown as their escape sequences, and
    codepoints with no name in the Unicode database are named ``<unknown>``.
    """
    # NOTE(review): this file came from an extraction that collapsed runs of
    # whitespace; confirm the spacing inside this template against the
    # canonical source.
    template = "U+{code:04X} {display} [{category}] {name}"

    for char in text:
        # Show unprintable characters as an escape sequence, not as themselves.
        glyph = (
            char
            if char.isprintable()
            else char.encode("unicode-escape").decode("ascii")
        )
        line = template.format(
            display=display_ljust(glyph, 7),
            code=ord(char),
            category=unicodedata.category(char),
            name=unicodedata.name(char, "<unknown>"),
        )
        print(line)