diff --git a/py/lexer.c b/py/lexer.c
index 12cb5ae5b..97c84cf11 100644
--- a/py/lexer.c
+++ b/py/lexer.c
@@ -112,12 +112,11 @@ STATIC bool is_following_odigit(mp_lexer_t *lex) {
     return lex->chr1 >= '0' && lex->chr1 <= '7';
 }
 
-// TODO UNICODE include unicode characters in definition of identifiers
+// to easily parse utf-8 identifiers we allow any raw byte with high bit set
 STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
-    return is_letter(lex) || lex->chr0 == '_';
+    return is_letter(lex) || lex->chr0 == '_' || lex->chr0 >= 0x80;
 }
 
-// TODO UNICODE include unicode characters in definition of identifiers
 STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
     return is_head_of_identifier(lex) || is_digit(lex);
 }
@@ -523,13 +522,13 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
     } else if (is_head_of_identifier(lex)) {
         lex->tok_kind = MP_TOKEN_NAME;
 
-        // get first char
-        vstr_add_char(&lex->vstr, CUR_CHAR(lex));
+        // get first char (add as byte to remain 8-bit clean and support utf-8)
+        vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
         next_char(lex);
 
         // get tail chars
         while (!is_end(lex) && is_tail_of_identifier(lex)) {
-            vstr_add_char(&lex->vstr, CUR_CHAR(lex));
+            vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
             next_char(lex);
         }
 
diff --git a/tests/unicode/unicode_id.py b/tests/unicode/unicode_id.py
new file mode 100644
index 000000000..10f540c50
--- /dev/null
+++ b/tests/unicode/unicode_id.py
@@ -0,0 +1,27 @@
+# test unicode in identifiers
+
+# comment
+# αβγδϵφζ
+
+# global identifiers
+α = 1
+αβγ = 2
+bβ = 3
+βb = 4
+print(α, αβγ, bβ, βb)
+
+# function, argument, local identifiers
+def α(β, γ):
+    δ = β + γ
+    print(β, γ, δ)
+α(1, 2)
+
+# class, method identifiers
+class φ:
+    def __init__(self):
+        pass
+    def δ(self, ϵ):
+        print(ϵ)
+zζzζz = φ()
+if hasattr(zζzζz, "δ"):
+    zζzζz.δ(ϵ=123)