diff --git a/CHANGES.md b/CHANGES.md index 4ad3ea8..94bbd3d 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,9 @@ +## unreleased + +### Added + +- Add locations in the JSON5 parser error messages (@gcluzel, #182) + ## 2.2.1 *2024-06-04* diff --git a/lib/json5/errors.ml b/lib/json5/errors.ml new file mode 100644 index 0000000..b47d7bd --- /dev/null +++ b/lib/json5/errors.ml @@ -0,0 +1,4 @@ +let string_of_position { Lexing.pos_lnum; pos_fname; _ } = + match pos_fname with + | "" -> Printf.sprintf "Line %d" pos_lnum + | fname -> Printf.sprintf "File %s, line %d" fname pos_lnum diff --git a/lib/json5/errors.mli b/lib/json5/errors.mli new file mode 100644 index 0000000..912da08 --- /dev/null +++ b/lib/json5/errors.mli @@ -0,0 +1,4 @@ +val string_of_position : Lexing.position -> string +(** [string_of_position pos] returns a string that contains the line and, if + supplied, the filename of the position in a way that's appropriate to include + in an error message *) diff --git a/lib/json5/lexer.ml b/lib/json5/lexer.ml index 12f66b0..88e1023 100644 --- a/lib/json5/lexer.ml +++ b/lib/json5/lexer.ml @@ -9,7 +9,6 @@ type token = | CLOSE_BRACKET | COLON | COMMA - | COMMENT of string | TRUE | FALSE | NULL @@ -18,25 +17,38 @@ type token = | INT of string | STRING of string | IDENTIFIER_NAME of string - -let pp_token ppf = function - | OPEN_PAREN -> Format.fprintf ppf "'('" - | CLOSE_PAREN -> Format.fprintf ppf "')'" - | OPEN_BRACE -> Format.fprintf ppf "'{'" - | CLOSE_BRACE -> Format.fprintf ppf "'}'" - | OPEN_BRACKET -> Format.fprintf ppf "'['" - | CLOSE_BRACKET -> Format.fprintf ppf "']'" - | COLON -> Format.fprintf ppf "':'" - | COMMA -> Format.fprintf ppf "','" - | COMMENT s -> Format.fprintf ppf "COMMENT '%s'" s - | TRUE -> Format.fprintf ppf "'true'" - | FALSE -> Format.fprintf ppf "'false'" - | NULL -> Format.fprintf ppf "'null'" - | FLOAT s -> Format.fprintf ppf "FLOAT '%s'" s - | INT_OR_FLOAT s -> Format.fprintf ppf "INT_OR_FLOAT '%s'" s - | INT s -> Format.fprintf ppf "INT '%s'" s - | STRING s -> Format.fprintf ppf "STRING '%s'" s - | IDENTIFIER_NAME s -> Format.fprintf ppf "IDENTIFIER_NAME '%s'" s + | EOF + +let pp_token ppf = + let ps = Format.pp_print_string ppf in + let pf = Format.fprintf ppf in + function + | OPEN_PAREN -> ps "'('" + | CLOSE_PAREN -> ps "')'" + | OPEN_BRACE -> ps "'{'" + | CLOSE_BRACE -> ps "'}'" + | OPEN_BRACKET -> ps "'['" + | CLOSE_BRACKET -> ps "']'" + | COLON -> ps "':'" + | COMMA -> ps "','" + | TRUE -> ps "'true'" + | FALSE -> ps "'false'" + | NULL -> ps "'null'" + | FLOAT s -> pf "FLOAT %S" s + | INT_OR_FLOAT s -> pf "INT_OR_STRING %S" s + | INT s -> pf "INT %S" s + | STRING s -> pf "%S" s + | IDENTIFIER_NAME s -> pf "IDENTIFIER_NAME %S" s + | EOF -> ps "EOF" + +let lexer_error lexbuf = + let pos_start, _pos_end = Sedlexing.lexing_positions lexbuf in + let location = Errors.string_of_position pos_start in + let msg = + Printf.sprintf "%s: Unexpected character '%s'" location + (Sedlexing.Utf8.lexeme lexbuf) + in + Error msg let source_character = [%sedlex.regexp? any] let line_terminator = [%sedlex.regexp? 0x000A | 0x000D | 0x2028 | 0x2029] @@ -182,10 +194,7 @@ let string_lex_single lexbuf strbuf = | Sub (source_character, ('\'' | line_terminator)) -> Buffer.add_string strbuf (lexeme lexbuf); lex lexbuf strbuf - | _ -> - lexeme lexbuf - |> Format.sprintf "Unexpected character: %s" - |> Result.error + | _ -> lexer_error lexbuf in lex lexbuf strbuf @@ -202,50 +211,48 @@ let string_lex_double lexbuf strbuf = | Sub (source_character, ('"' | line_terminator)) -> Buffer.add_string strbuf (lexeme lexbuf); lex lexbuf strbuf - | _ -> - lexeme lexbuf - |> Format.sprintf "Unexpected character: %s" - |> Result.error + | _ -> lexer_error lexbuf in lex lexbuf strbuf let string_lex lexbuf quote = let strbuf = Buffer.create 200 in - if quote = "'" then string_lex_single lexbuf strbuf - else if quote = {|"|} then string_lex_double lexbuf strbuf - else Error (Format.sprintf "Invalid string quote %S" quote) + match quote with + | "'" -> string_lex_single lexbuf strbuf + | {|"|} -> string_lex_double lexbuf strbuf + | _ -> Error (Printf.sprintf "Invalid string quote %S" quote) let rec lex tokens buf = let lexeme = Sedlexing.Utf8.lexeme in + let pos, _ = Sedlexing.lexing_positions buf in match%sedlex buf with - | '(' -> lex (OPEN_PAREN :: tokens) buf - | ')' -> lex (CLOSE_PAREN :: tokens) buf - | '{' -> lex (OPEN_BRACE :: tokens) buf - | '}' -> lex (CLOSE_BRACE :: tokens) buf - | '[' -> lex (OPEN_BRACKET :: tokens) buf - | ']' -> lex (CLOSE_BRACKET :: tokens) buf - | ':' -> lex (COLON :: tokens) buf - | ',' -> lex (COMMA :: tokens) buf + | '(' -> lex ((OPEN_PAREN, pos) :: tokens) buf + | ')' -> lex ((CLOSE_PAREN, pos) :: tokens) buf + | '{' -> lex ((OPEN_BRACE, pos) :: tokens) buf + | '}' -> lex ((CLOSE_BRACE, pos) :: tokens) buf + | '[' -> lex ((OPEN_BRACKET, pos) :: tokens) buf + | ']' -> lex ((CLOSE_BRACKET, pos) :: tokens) buf + | ':' -> lex ((COLON, pos) :: tokens) buf + | ',' -> lex ((COMMA, pos) :: tokens) buf | Chars {|"'|} -> let* s = string_lex buf (lexeme buf) in - lex (STRING s :: tokens) buf + lex ((STRING s, pos) :: tokens) buf | multi_line_comment | single_line_comment | white_space | line_terminator -> lex tokens buf - | "true" -> lex (TRUE :: tokens) buf - | "false" -> lex (FALSE :: tokens) buf - | "null" -> lex (NULL :: tokens) buf + | "true" -> lex ((TRUE, pos) :: tokens) buf + | "false" -> lex ((FALSE, pos) :: tokens) buf + | "null" -> lex ((NULL, pos) :: tokens) buf | json5_float -> let s = lexeme buf in - lex (FLOAT s :: tokens) buf + lex ((FLOAT s, pos) :: tokens) buf | json5_int -> let s = lexeme buf in - lex (INT s :: tokens) buf + lex ((INT s, pos) :: tokens) buf | json5_int_or_float -> let s = lexeme buf in - lex (INT_OR_FLOAT s :: tokens) buf + lex ((INT_OR_FLOAT s, pos) :: tokens) buf | identifier_name -> let s = lexeme buf in - lex (IDENTIFIER_NAME s :: tokens) buf - | eof -> Ok (List.rev tokens) - | _ -> - lexeme buf |> Format.asprintf "Unexpected character: '%s'" |> Result.error + lex ((IDENTIFIER_NAME s, pos) :: tokens) buf + | eof -> Ok (List.rev ((EOF, pos) :: tokens)) + | _ -> lexer_error buf diff --git a/lib/json5/parser.ml b/lib/json5/parser.ml index 1f19c82..6ff5d80 100644 --- a/lib/json5/parser.ml +++ b/lib/json5/parser.ml @@ -1,46 +1,69 @@ open Let_syntax.Result +let parser_error pos error = + let location = Errors.string_of_position pos in + let msg = Printf.sprintf "%s: %s" location error in + Error msg + let rec parse_list acc = function - | [] -> Error "List never ends" - | Lexer.CLOSE_BRACKET :: xs -> Ok (acc, xs) + | [] -> Error "Unexpected end of input" + | [ (Lexer.EOF, pos) ] -> parser_error pos "Unexpected end of input" + | (Lexer.CLOSE_BRACKET, _) :: xs -> Ok (acc, xs) | xs -> ( let* v, xs = parse xs in match xs with - | [] -> Error "List was not closed" - | Lexer.CLOSE_BRACKET :: xs | COMMA :: CLOSE_BRACKET :: xs -> + | [] -> Error "Unexpected end of input" + | [ (Lexer.EOF, pos) ] -> parser_error pos "Unexpected end of input" + | (Lexer.CLOSE_BRACKET, _) :: xs | (COMMA, _) :: (CLOSE_BRACKET, _) :: xs + -> Ok (v :: acc, xs) - | COMMA :: xs -> parse_list (v :: acc) xs - | x :: _ -> + | (COMMA, _) :: xs -> parse_list (v :: acc) xs + | (x, pos) :: _ -> let s = Format.asprintf "Unexpected list token: %a" Lexer.pp_token x in - Error s) + parser_error pos s) and parse_assoc acc = function - | [] -> Error "Assoc never ends" - | Lexer.CLOSE_BRACE :: xs -> Ok (acc, xs) - | STRING k :: COLON :: xs | IDENTIFIER_NAME k :: COLON :: xs -> ( - let* v, xs = parse xs in - let item = (k, v) in + | [] -> Error "Unexpected end of input" + | [ (Lexer.EOF, pos) ] -> parser_error pos "Unexpected end of input" + | (CLOSE_BRACE, _) :: xs -> Ok (acc, xs) + | (STRING k, _) :: xs | (IDENTIFIER_NAME k, _) :: xs -> ( match xs with - | [] -> Error "Object was not closed" - | Lexer.CLOSE_BRACE :: xs | COMMA :: CLOSE_BRACE :: xs -> - Ok (item :: acc, xs) - | COMMA :: xs -> parse_assoc (item :: acc) xs - | x :: _ -> + | [] -> Error "Unexpected end of input" + | [ (Lexer.EOF, pos) ] -> parser_error pos "Unexpected end of input" + | (Lexer.COLON, _) :: xs -> ( + let* v, xs = parse xs in + let item = (k, v) in + match xs with + | [] -> Error "Unexpected end of input" + | [ (Lexer.EOF, pos) ] -> parser_error pos "Unexpected end of input" + | (CLOSE_BRACE, _) :: xs | (COMMA, _) :: (CLOSE_BRACE, _) :: xs -> + Ok (item :: acc, xs) + | (COMMA, _) :: xs -> parse_assoc (item :: acc) xs + | (x, pos) :: _ -> + let s = + Format.asprintf "Unexpected assoc list token: %a" Lexer.pp_token + x + in + parser_error pos s) + | (x, pos) :: _ -> let s = - Format.asprintf "Unexpected assoc list token: %a" Lexer.pp_token x + Format.asprintf "Expected %a but found %a" Lexer.pp_token + Lexer.COLON Lexer.pp_token x in - Error s) - | x :: _ -> + parser_error pos s) + | (x, pos) :: _ -> let s = - Format.asprintf "Unexpected assoc list token: %a" Lexer.pp_token x + Format.asprintf "Expected string or identifier but found %a" + Lexer.pp_token x in - Error s + parser_error pos s and parse = function - | [] -> Error "empty list of tokens" - | token :: xs -> ( + | [] -> Error "Unexpected end of input" + | [ (Lexer.EOF, pos) ] -> parser_error pos "Unexpected end of input" + | (token, pos) :: xs -> ( match token with | TRUE -> Ok (Ast.Bool true, xs) | FALSE -> Ok (Bool false, xs) @@ -57,12 +80,10 @@ and parse = function (Ast.Assoc (List.rev a), xs) | x -> let s = Format.asprintf "Unexpected token: %a" Lexer.pp_token x in - Error s) + parser_error pos s) -let parse_from_lexbuf ?fname ?lnum lexbuffer = - let fname = Option.value fname ~default:"" in +let parse_from_lexbuf ?(fname = "") ?(lnum = 1) lexbuffer = Sedlexing.set_filename lexbuffer fname; - let lnum = Option.value lnum ~default:1 in let pos = { Lexing.pos_fname = fname; pos_lnum = lnum; pos_bol = 0; pos_cnum = 0 } in diff --git a/test_json5/test.ml b/test_json5/test.ml index 354c04e..9ae7f26 100644 --- a/test_json5/test.ml +++ b/test_json5/test.ml @@ -2,19 +2,23 @@ module M = Yojson_five.Safe let yojson = Alcotest.testable M.pp M.equal -let parsing_test_case name expected input = +(* any error message will match the string. *) +let any_string = Alcotest.testable Fmt.string (fun _ _ -> true) + +let parsing_test_case name error_msg expected input = Alcotest.test_case name `Quick (fun () -> - (* any error message will do *) - let any_string = Alcotest.testable Fmt.string (fun _ _ -> true) in - Alcotest.(check (result yojson any_string)) + Alcotest.(check (result yojson error_msg)) name expected (M.from_string input)) let parsing_should_succeed name input expected = - parsing_test_case name (Ok expected) input + parsing_test_case name Alcotest.string (Ok expected) input let parsing_should_fail name input = let failure = Error "" in - parsing_test_case name failure input + parsing_test_case name any_string failure input + +let parsing_should_fail_with_error name input expected = + parsing_test_case name Alcotest.string (Error expected) input let parsing_tests = [ @@ -101,6 +105,16 @@ No \\n's!", "backwardsCompatible": "with JSON", }|} expected); + parsing_should_fail_with_error "unexpected EOF in list" "[1, 2," + "Line 1: Unexpected end of input"; + parsing_should_fail_with_error "unexpected EOF on different line" "\n[1, 2," + "Line 2: Unexpected end of input"; + parsing_should_fail_with_error "unexpected EOF in assoc" {|{"foo": 1,|} + "Line 1: Unexpected end of input"; + parsing_should_fail_with_error "missing colon in assoc" {|{"foo"}|} + "Line 1: Expected ':' but found '}'"; + parsing_should_fail_with_error "bad identifier in assoc" {|{[0]}|} + "Line 1: Expected string or identifier but found '['"; ] let writing_test_case name input expected =