From 52d21d254913c27673722bc6a23531e59bff558c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C3=ABl=20Blais=20Masson?= Date: Sat, 20 Jan 2018 16:05:03 -0500 Subject: [PATCH 1/4] Add :max_parse_errors argument to .parse, .get and .fragment Default value is 100. This avoids potentially extreme memory usage when parsing a document with a huge number of HTML errors. --- ext/nokogumboc/nokogumbo.c | 11 ++++++----- lib/nokogumbo.rb | 14 +++++++------- test-nokogumbo.rb | 28 +++++++++++++++++++++++++++- 3 files changed, 40 insertions(+), 13 deletions(-) diff --git a/ext/nokogumboc/nokogumbo.c b/ext/nokogumboc/nokogumbo.c index a43c34a8..3256d6a3 100644 --- a/ext/nokogumboc/nokogumbo.c +++ b/ext/nokogumboc/nokogumbo.c @@ -184,7 +184,8 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node) { } // Parse a string using gumbo_parse into a Nokogiri document -static VALUE parse(VALUE self, VALUE string) { +static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) { + int max_errors = NUM2INT(max_parse_errors); const GumboOptions *options = &kGumboDefaultOptions; const char *input = RSTRING_PTR(string); size_t input_len = RSTRING_LEN(string); @@ -217,14 +218,14 @@ static VALUE parse(VALUE self, VALUE string) { VALUE rdoc = Nokogiri_wrap_xml_document(Document, doc); // Add parse errors to rdoc. - if (output->errors.length) { + if (output->errors.length && max_errors > 0) { GumboVector *errors = &output->errors; GumboParser parser = { ._options = options }; GumboStringBuffer msg; - VALUE rerrors = rb_ary_new2(errors->length); + VALUE rerrors = rb_ary_new2(errors->length < max_errors ? errors->length : max_errors); gumbo_string_buffer_init(&parser, &msg); - for (int i=0; i < errors->length; i++) { + for (int i=0; i < errors->length && i < max_errors; i++) { GumboError *err = errors->data[i]; gumbo_string_buffer_clear(&parser, &msg); // Work around bug in gumbo_caret_diagnostic_to_string. @@ -288,5 +289,5 @@ void Init_nokogumboc() { // define Nokogumbo class with a singleton parse method VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject); - rb_define_singleton_method(Gumbo, "parse", parse, 1); + rb_define_singleton_method(Gumbo, "parse", parse, 2); } diff --git a/lib/nokogumbo.rb b/lib/nokogumbo.rb index 92f2768b..d2ef252b 100644 --- a/lib/nokogumbo.rb +++ b/lib/nokogumbo.rb @@ -4,14 +4,14 @@ module Nokogiri # Parse an HTML document. +string+ contains the document. +string+ # may also be an IO-like object. Returns a +Nokogiri::HTML::Document+. - def self.HTML5(string) - Nokogiri::HTML5.parse(string) + def self.HTML5(*args) + Nokogiri::HTML5.parse(*args) end module HTML5 # Parse an HTML document. +string+ contains the document. +string+ # may also be an IO-like object. Returns a +Nokogiri::HTML::Document+. - def self.parse(string) + def self.parse(string, options={}) if string.respond_to? :read string = string.read end @@ -21,7 +21,7 @@ def self.parse(string) string = reencode(string) end - Nokogumbo.parse(string.to_s) + Nokogumbo.parse(string.to_s, options[:max_parse_errors] || 100) end # Fetch and parse a HTML document from the web, following redirects, @@ -67,7 +67,7 @@ def self.get(uri, options={}) case response when Net::HTTPSuccess - doc = parse(reencode(response.body, response['content-type'])) + doc = parse(reencode(response.body, response['content-type']), options) doc.instance_variable_set('@response', response) doc.class.send(:attr_reader, :response) doc @@ -83,8 +83,8 @@ def self.get(uri, options={}) # while fragment is on the Gumbo TODO list, simulate it by doing # a full document parse and ignoring the parent , , and # tags, and collecting up the children of each. - def self.fragment(string) - doc = parse(string) + def self.fragment(*args) + doc = parse(*args) fragment = Nokogiri::HTML::DocumentFragment.new(doc) if doc.children.length != 1 or doc.children.first.name != 'html' diff --git a/test-nokogumbo.rb b/test-nokogumbo.rb index a8f6eb85..bebf9fc4 100644 --- a/test-nokogumbo.rb +++ b/test-nokogumbo.rb @@ -78,7 +78,7 @@ def test_bogus_encoding end def test_html5_doctype - doc = Nokogumbo.parse("") + doc = Nokogiri::HTML5.parse("") assert_match //, doc.to_html end @@ -132,11 +132,37 @@ def test_parse_errors assert_empty doc.errors end + def test_max_parse_errors + # This document contains 2 parse errors, but we force limit to 1. + doc = Nokogiri::HTML5("", max_parse_errors: 1) + assert_equal 1, doc.errors.length + doc = Nokogiri::HTML5("", max_parse_errors: 1) + assert_empty doc.errors + end + + def test_default_max_parse_errors + # This document contains 200 parse errors, but default limit is 100. + doc = Nokogiri::HTML5("" + "

" * 200) + assert_equal 100, doc.errors.length + end + def test_parse_fragment_errors doc = Nokogiri::HTML5.fragment("<\r\n") refute_empty doc.errors end + def test_fragment_max_parse_errors + # This fragment contains 3 parse errors, but we force limit to 1. + doc = Nokogiri::HTML5.fragment("", max_parse_errors: 1) + assert_equal 1, doc.errors.length + end + + def test_fragment_default_max_parse_errors + # This fragment contains 201 parse errors, but default limit is 100. + doc = Nokogiri::HTML5.fragment("

" * 200) + assert_equal 100, doc.errors.length + end + private def buffer From e99c3e5902f2f9e0223128483b0959226e83c4b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C3=ABl=20Blais=20Masson?= Date: Sat, 20 Jan 2018 16:52:07 -0500 Subject: [PATCH 2/4] Make :max_parse_errors default to 0 This makes Gumbo parse error tracking opt-in for users wanting it. --- lib/nokogumbo.rb | 2 +- test-nokogumbo.rb | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/lib/nokogumbo.rb b/lib/nokogumbo.rb index d2ef252b..c03cdb1b 100644 --- a/lib/nokogumbo.rb +++ b/lib/nokogumbo.rb @@ -21,7 +21,7 @@ def self.parse(string, options={}) string = reencode(string) end - Nokogumbo.parse(string.to_s, options[:max_parse_errors] || 100) + Nokogumbo.parse(string.to_s, options[:max_parse_errors] || 0) end # Fetch and parse a HTML document from the web, following redirects, diff --git a/test-nokogumbo.rb b/test-nokogumbo.rb index bebf9fc4..62091471 100644 --- a/test-nokogumbo.rb +++ b/test-nokogumbo.rb @@ -126,9 +126,9 @@ def test_root_comments end def test_parse_errors - doc = Nokogiri::HTML5("") + doc = Nokogiri::HTML5("", max_parse_errors: 10) assert_equal doc.errors.length, 2 - doc = Nokogiri::HTML5("") + doc = Nokogiri::HTML5("", max_parse_errors: 10) assert_empty doc.errors end @@ -141,13 +141,13 @@ def test_max_parse_errors end def test_default_max_parse_errors - # This document contains 200 parse errors, but default limit is 100. + # This document contains 200 parse errors, but default limit is 0. doc = Nokogiri::HTML5("" + "

" * 200) - assert_equal 100, doc.errors.length + assert_equal 0, doc.errors.length end def test_parse_fragment_errors - doc = Nokogiri::HTML5.fragment("<\r\n") + doc = Nokogiri::HTML5.fragment("<\r\n", max_parse_errors: 10) refute_empty doc.errors end @@ -158,9 +158,9 @@ def test_fragment_max_parse_errors end def test_fragment_default_max_parse_errors - # This fragment contains 201 parse errors, but default limit is 100. + # This fragment contains 201 parse errors, but default limit is 0. doc = Nokogiri::HTML5.fragment("

" * 200) - assert_equal 100, doc.errors.length + assert_equal 0, doc.errors.length end private From 253b779f1670c3b2f9f3db2f125f2ccf7c77fd83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C3=ABl=20Blais=20Masson?= Date: Sat, 20 Jan 2018 16:59:55 -0500 Subject: [PATCH 3/4] Check for `max_errors > 0` first This check will be falsy most of the time now that 0 is the default. --- ext/nokogumboc/nokogumbo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/nokogumboc/nokogumbo.c b/ext/nokogumboc/nokogumbo.c index 3256d6a3..030bd914 100644 --- a/ext/nokogumboc/nokogumbo.c +++ b/ext/nokogumboc/nokogumbo.c @@ -218,7 +218,7 @@ static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) { VALUE rdoc = Nokogiri_wrap_xml_document(Document, doc); // Add parse errors to rdoc. - if (output->errors.length && max_errors > 0) { + if (max_errors > 0 && output->errors.length) { GumboVector *errors = &output->errors; GumboParser parser = { ._options = options }; GumboStringBuffer msg; From c7397aab334fa1a3a3670b02f33daca04ee90524 Mon Sep 17 00:00:00 2001 From: Rafael Masson Date: Wed, 24 Jan 2018 16:44:16 -0500 Subject: [PATCH 4/4] =?UTF-8?q?Use=20Gumbo=E2=80=99s=20built-in=20`max=5Fe?= =?UTF-8?q?rrors`=20option?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ext/nokogumboc/nokogumbo.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/ext/nokogumboc/nokogumbo.c b/ext/nokogumboc/nokogumbo.c index 030bd914..46523f37 100644 --- a/ext/nokogumboc/nokogumbo.c +++ b/ext/nokogumboc/nokogumbo.c @@ -185,11 +185,13 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node) { // Parse a string using gumbo_parse into a Nokogiri document static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) { - int max_errors = NUM2INT(max_parse_errors); - const GumboOptions *options = &kGumboDefaultOptions; + GumboOptions options; + memcpy(&options, &kGumboDefaultOptions, sizeof options); + options.max_errors = NUM2INT(max_parse_errors); + const char *input = RSTRING_PTR(string); size_t input_len = RSTRING_LEN(string); - GumboOutput *output = gumbo_parse_with_options(options, input, input_len); + GumboOutput *output = gumbo_parse_with_options(&options, input, input_len); xmlDocPtr doc = xmlNewDoc(CONST_CAST "1.0"); #ifdef NGLIB doc->type = XML_HTML_DOCUMENT_NODE; @@ -218,14 +220,14 @@ static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) { VALUE rdoc = Nokogiri_wrap_xml_document(Document, doc); // Add parse errors to rdoc. - if (max_errors > 0 && output->errors.length) { + if (output->errors.length) { GumboVector *errors = &output->errors; - GumboParser parser = { ._options = options }; + GumboParser parser = { ._options = &options }; GumboStringBuffer msg; - VALUE rerrors = rb_ary_new2(errors->length < max_errors ? errors->length : max_errors); + VALUE rerrors = rb_ary_new2(errors->length); gumbo_string_buffer_init(&parser, &msg); - for (int i=0; i < errors->length && i < max_errors; i++) { + for (int i=0; i < errors->length; i++) { GumboError *err = errors->data[i]; gumbo_string_buffer_clear(&parser, &msg); // Work around bug in gumbo_caret_diagnostic_to_string. @@ -254,7 +256,7 @@ static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) { gumbo_string_buffer_destroy(&parser, &msg); } - gumbo_destroy_output(options, output); + gumbo_destroy_output(&options, output); return rdoc; }