py-xbc/src/xbc/utils.py

# Copyright (c) 2024 Síle Ekaterin Liszka
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

'''
Utility functions for saving data in XBC format.
'''

import re

from collections.abc import Sequence, Mapping

KEY_RE   = re.compile(r'^[a-zA-Z0-9_-]+(?:\.(?:[a-zA-Z0-9_-]+))*$')
NVAL_RE  = re.compile(r'^[^{}=+:;,\n\'"]+$')
QUOTES   = '\'"'
QUOTEMAP = {'"': "'", "'": '"'}
ESCAPES  = {
    'backslash': {'"': '\\x22', "'": '\\x27'},
    'html': {'"': '&quot;', "'": '&apos;'},
    'url': {'"': '%22', "'": '%27'}
}

def quote(data, escape='backslash'):
    'Quote data according to XBC rules.'
    # don't waste our time if it's a valid bare value.
    if NVAL_RE.match(data) is not None:
        return data
    esc = None

    # how shall we escape embedded quotes?
    if isinstance(esc, Mapping):
        if '"' in escape and "'" in escape:
            esc = escape
    else:
        esc = ESCAPES.get(escape, None)

    if esc is None:
        raise ValueError('unrecognised escape format')

    f = data[0]

    # is this a quoted string?
    if f in QUOTES and data[-1] == f:
        # return it if we don't need to do anything
        if f not in data[1:-1]:
            return data
        # escape embedded quotes
        x = data[1:-1].replace(f, esc[f])
        return f'{f}{x}{f}'

    # if the other quote isn't used, wrap in it
    if f in QUOTES and QUOTEMAP[f] not in data[1:]:
        q = QUOTEMAP[f]
    # not a quoted string, but has only one kind of quote
    elif "'" in data and '"' not in data:
        q = '"'
    elif '"' in data and "'" not in data:
        q = "'"
    else:
        # not a quoted string and has both types; we escape one
        data = data.replace("'", esc["'"])
        q = "'"
    return f'{q}{data}{q}'

def normalise_string(string):
    'Normalise values according to XBC rules.'
    if not isinstance(string, str):
        string = str(string)

    if NVAL_RE.match(string) is None:
        string = quote(string)

    return string

def normalise(data):
    '''Normalise values according to XBC rules.'''
    if isinstance(data, str) or not isinstance(data, (Sequence, Mapping)):
        return normalise_string(data)

    if isinstance(data, Sequence):
        ret = []

        for item in data:
            if isinstance(item, str) or not isinstance(item, (Sequence, Mapping)):
                ret.append(normalise_string(item))
            # we can unwind nested sequences
            elif isinstance(item, Sequence):
                ret.extend(normalise(item))
            # ...but we can't do that with mappings, the format doesn't
            # support it.
            elif isinstance(item, Mapping):
                raise ValueError('nested mapping')
            else:
                # this should be impossible to reach, but nothing truly is.
                raise ValueError(type(item))

        ret = ', '.join(ret)

        return ret

    if isinstance(data, Mapping):
        d = {}
        for k, v in data.items():
            if KEY_RE.match(k) is None:
                raise KeyError(k)

            v = normalise(v)

            d[k] = v

        return d

    return data

__all__ = ['quote', 'normalise', 'normalise_string']