# -*- coding: utf-8 -*-
"""
babel.messages.jslexer
~~~~~~~~~~~~~~~~~~~~~~
A simple JavaScript 1.5 lexer which is used for the JavaScript
extractor.
:copyright: (c) 2013 by the Babel Team.
:license: BSD, see LICENSE for more details.
"""
from collections import namedtuple
import re
from babel._compat import unichr
operators = sorted([
'+', '-', '*', '%', '!=', '==', '<', '>', '<=', '>=', '=',
'+=', '-=', '*=', '%=', '<<', '>>', '>>>', '<<=', '>>=',
'>>>=', '&', '&=', '|', '|=', '&&', '||', '^', '^=', '(', ')',
'[', ']', '{', '}', '!', '--', '++', '~', ',', ';', '.', ':'
], key=len, reverse=True)
escapes = {'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t'}
name_re = re.compile(r'[\w$_][\w\d$_]*', re.UNICODE)
dotted_name_re = re.compile(r'[\w$_][\w\d$_.]*[\w\d$_.]', re.UNICODE)
division_re = re.compile(r'/=?')
regex_re = re.compile(r'/(?:[^/\\]*(?:\\.[^/\\]*)*)/[a-zA-Z]*', re.DOTALL)
line_re = re.compile(r'(\r\n|\n|\r)')
line_join_re = re.compile(r'\\' + line_re.pattern)
uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')
Token = namedtuple('Token', 'type value lineno')
_rules = [
(None, re.compile(r'\s+', re.UNICODE)),
(None, re.compile(r'<!--.*')),
('linecomment', re.compile(r'//.*')),
('multilinecomment', re.compile(r'/\*.*?\*/', re.UNICODE | re.DOTALL)),
('dotted_name', dotted_name_re),
('name', name_re),
('number', re.compile(r'''(
(?:0|[1-9]\d*)
(\.\d+)?
([eE][-+]?\d+)? |
(0x[a-fA-F0-9]+)
)''', re.VERBOSE)),
('jsx_tag', re.compile(r'(?:</?[^>\s]+|/>)', re.I)), # May be mangled in `get_rules`
('operator', re.compile(r'(%s)' % '|'.join(map(re.escape, operators)))),
('template_string', re.compile(r'''`(?:[^`\\]*(?:\\.[^`\\]*)*)`''', re.UNICODE)),
('string', re.compile(r'''(
'(?:[^'\\]*(?:\\.[^'\\]*)*)' |
"(?:[^"\\]*(?:\\.[^"\\]*)*)"
)''', re.VERBOSE | re.DOTALL))
]
def get_rules(jsx, dotted, template_string):
"""
Get a tokenization rule list given the passed syntax options.
Internal to this module.
"""
rules = []
for token_type, rule in _rules:
if not jsx and token_type and 'jsx' in token_type:
continue
if not template_string and token_type == 'template_string':
continue
if token_type == 'dotted_name':
if not dotted:
continue
token_type = 'name'
rules.append((token_type, rule))
return rules
def indicates_division(token):
"""A helper function that helps the tokenizer to decide if the current
token may be followed by a division operator.
"""
if token.type == 'operator':
return token.value in (')', ']', '}', '++', '--')
return token.type in ('name', 'number', 'string', 'regexp')
def unquote_string(string):
"""Unquote a string with JavaScript rules. The string has to start with
string delimiters (``'``, ``"`` or the back-tick/grave accent (for template strings).)
"""
assert string and string[0] == string[-1] and string[0] in '"\'`', \
'string provided is not properly delimited'
string = line_join_re.sub('\\1', string[1:-1])
result = []
add = result.append
pos = 0
while 1:
# scan for the next escape
escape_pos = string.find('\\', pos)
if escape_pos < 0:
break
add(string[pos:escape_pos])
# check which character is escaped
next_char = string[escape_pos + 1]
if next_char in escapes:
add(escapes[next_char])
# unicode escapes. trie to consume up to four characters of
# hexadecimal characters and try to interpret them as unicode
# character point. If there is no such character point, put
# all the consumed characters into the string.
elif next_char in 'uU':
escaped = uni_escape_re.match(string, escape_pos + 2)
if escaped is not None:
escaped_value = escaped.group()
if len(escaped_value) == 4:
try:
add(unichr(int(escaped_value, 16)))
except ValueError:
pass
else:
pos = escape_pos + 6
continue
add(next_char + escaped_value)
pos = escaped.end()
continue
else:
add(next_char)
# bogus escape. Just remove the backslash.
else:
add(next_char)
pos = escape_pos + 2
if pos < len(string):
add(string[pos:])
return u''.join(result)
def tokenize(source, jsx=True, dotted=True, template_string=True):
"""
Tokenize JavaScript/JSX source. Returns a generator of tokens.
:param jsx: Enable (limited) JSX parsing.
:param dotted: Read dotted names as single name token.
:param template_string: Support ES6 template strings
"""
may_divide = False
pos = 0
lineno = 1
end = len(source)
rules = get_rules(jsx=jsx, dotted=dotted, template_string=template_string)
while pos < end:
# handle regular rules first
for token_type, rule in rules:
match = rule.match(source, pos)
if match is not None:
break
# if we don't have a match we don't give up yet, but check for
# division operators or regular expression literals, based on
# the status of `may_divide` which is determined by the last
# processed non-whitespace token using `indicates_division`.
else:
if may_divide:
match = division_re.match(source, pos)
token_type = 'operator'
else:
match = regex_re.match(source, pos)
token_type = 'regexp'
if match is None:
# woops. invalid syntax. jump one char ahead and try again.
pos += 1
continue
token_value = match.group()
if token_type is not None:
token = Token(token_type, token_value, lineno)
may_divide = indicates_division(token)
yield token
lineno += len(line_re.findall(token_value))
pos = match.end()