diff options
author | Paul J. Davis <paul.joseph.davis@gmail.com> | 2020-04-21 15:48:16 -0500 |
---|---|---|
committer | Joan Touzet <wohali@users.noreply.github.com> | 2020-04-27 12:45:33 -0400 |
commit | bb43a697fedaac44c9ff4a56e9461d99341cd297 (patch) | |
tree | 21217f2920d72b8eb77164e8c7f92cd1e5da5977 | |
parent | e5239b79449f3da9daad9f297b081498f5670004 (diff) | |
download | couchdb-bb43a697fedaac44c9ff4a56e9461d99341cd297.tar.gz |
Replace broken u-escape sequences
-rw-r--r-- | src/couch/src/couch_query_servers.erl | 106 |
1 files changed, 104 insertions, 2 deletions
diff --git a/src/couch/src/couch_query_servers.erl b/src/couch/src/couch_query_servers.erl index c6d255f17..9842177d3 100644 --- a/src/couch/src/couch_query_servers.erl +++ b/src/couch/src/couch_query_servers.erl @@ -519,7 +519,7 @@ with_ddoc_proc(#doc{id=DDocId,revs={Start, [DiskRev|_]}}=DDoc, Fun) -> proc_prompt(Proc, Args) -> case proc_prompt_raw(Proc, Args) of {json, Json} -> - ?JSON_DECODE(Json); + raw_to_ejson({json, Json}); EJson -> EJson end. @@ -528,10 +528,76 @@ proc_prompt_raw(#proc{prompt_fun = {Mod, Func}} = Proc, Args) -> apply(Mod, Func, [Proc#proc.pid, Args]). raw_to_ejson({json, Json}) -> - ?JSON_DECODE(Json); + try + ?JSON_DECODE(Json) + catch throw:{invalid_json, {_, invalid_string}} -> + Forced = try + force_utf8(Json) + catch _:_ -> + Json + end, + ?JSON_DECODE(Forced) + end; raw_to_ejson(EJson) -> EJson. +force_utf8(Bin) -> + case binary:match(Bin, <<"\\u">>) of + {Start, 2} -> + <<Prefix:Start/binary, Rest1/binary>> = Bin, + {Insert, Rest3} = case check_uescape(Rest1) of + {ok, Skip} -> + <<Skipped:Skip/binary, Rest2/binary>> = Rest1, + {Skipped, Rest2}; + {error, Skip} -> + <<_:Skip/binary, Rest2/binary>> = Rest1, + {<<16#EF, 16#BF, 16#BD>>, Rest2} + end, + RestForced = force_utf8(Rest3), + <<Prefix/binary, Insert/binary, RestForced/binary>>; + nomatch -> + Bin + end. + +check_uescape(Data) -> + case extract_uescape(Data) of + {Hi, Rest} when Hi >= 16#D800, Hi < 16#DC00 -> + case extract_uescape(Rest) of + {Lo, _} when Lo >= 16#DC00, Lo =< 16#DFFF -> + % A low surrogate pair + UTF16 = << + Hi:16/big-unsigned-integer, + Lo:16/big-unsigned-integer + >>, + try + [_] = xmerl_ucs:from_utf16be(UTF16), + {ok, 12} + catch _:_ -> + {error, 6} + end; + {_, _} -> + % Found a uescape that's not a low half + {error, 6}; + false -> + % No hex escape found + {error, 6} + end; + {Hi, _} when Hi >= 16#DC00, Hi =< 16#DFFF -> + % Found a low surrogate half without a high half + {error, 6}; + {_, _} -> + % Found a uescape we don't care about + {ok, 6}; + false -> + % Incomplete uescape which we don't care about + {ok, 2} + end. + +extract_uescape(<<"\\u", Code:4/binary, Rest/binary>>) -> + {binary_to_integer(Code, 16), Rest}; +extract_uescape(_) -> + false. + proc_stop(Proc) -> {Mod, Func} = Proc#proc.stop_fun, apply(Mod, Func, [Proc#proc.pid]). @@ -680,4 +746,40 @@ test_reduce(Reducer, KVs) -> {ok, Finalized} = finalize(Reducer, Reduced), Finalized. +force_utf8_test() -> + % "\uDCA5\uD83D" + Ok = [ + <<"foo">>, + <<"\\u00A0">>, + <<"\\u0032">>, + <<"\\uD83D\\uDCA5">>, + <<"foo\\uD83D\\uDCA5bar">>, + % Truncated but we doesn't break replacements + <<"\\u0FA">> + ], + lists:foreach(fun(Case) -> + ?assertEqual(Case, force_utf8(Case)) + end, Ok), + + NotOk = [ + <<"\\uDCA5">>, + <<"\\uD83D">>, + <<"fo\\uDCA5bar">>, + <<"foo\\uD83Dbar">>, + <<"\\uDCA5\\uD83D">>, + <<"\\uD83Df\\uDCA5">>, + <<"\\uDCA5\\u00A0">>, + <<"\\uD83D\\u00A0">> + ], + ToJSON = fun(Bin) -> <<34, Bin/binary, 34>> end, + lists:foreach(fun(Case) -> + try + ?assertNotEqual(Case, force_utf8(Case)), + ?assertThrow(_, ?JSON_DECODE(ToJSON(Case))), + ?assertMatch(<<_/binary>>, ?JSON_DECODE(ToJSON(force_utf8(Case)))) + catch T:R:S -> + io:format(standard_error, "~p~n~p~n~p~n", [T, R, S]) + end + end, NotOk). + -endif. |