From 52d21d254913c27673722bc6a23531e59bff558c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C3=ABl=20Blais=20Masson?=
Date: Sat, 20 Jan 2018 16:05:03 -0500
Subject: [PATCH 1/4] Add :max_parse_errors argument to .parse, .get and
.fragment
Default value is 100. This avoids potentially extreme memory usage when parsing a document with a huge number of HTML errors.
---
ext/nokogumboc/nokogumbo.c | 11 ++++++-----
lib/nokogumbo.rb | 14 +++++++-------
test-nokogumbo.rb | 28 +++++++++++++++++++++++++++-
3 files changed, 40 insertions(+), 13 deletions(-)
diff --git a/ext/nokogumboc/nokogumbo.c b/ext/nokogumboc/nokogumbo.c
index a43c34a8..3256d6a3 100644
--- a/ext/nokogumboc/nokogumbo.c
+++ b/ext/nokogumboc/nokogumbo.c
@@ -184,7 +184,8 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node) {
}
// Parse a string using gumbo_parse into a Nokogiri document
-static VALUE parse(VALUE self, VALUE string) {
+static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) {
+ int max_errors = NUM2INT(max_parse_errors);
const GumboOptions *options = &kGumboDefaultOptions;
const char *input = RSTRING_PTR(string);
size_t input_len = RSTRING_LEN(string);
@@ -217,14 +218,14 @@ static VALUE parse(VALUE self, VALUE string) {
VALUE rdoc = Nokogiri_wrap_xml_document(Document, doc);
// Add parse errors to rdoc.
- if (output->errors.length) {
+ if (output->errors.length && max_errors > 0) {
GumboVector *errors = &output->errors;
GumboParser parser = { ._options = options };
GumboStringBuffer msg;
- VALUE rerrors = rb_ary_new2(errors->length);
+ VALUE rerrors = rb_ary_new2(errors->length < max_errors ? errors->length : max_errors);
gumbo_string_buffer_init(&parser, &msg);
- for (int i=0; i < errors->length; i++) {
+ for (int i=0; i < errors->length && i < max_errors; i++) {
GumboError *err = errors->data[i];
gumbo_string_buffer_clear(&parser, &msg);
// Work around bug in gumbo_caret_diagnostic_to_string.
@@ -288,5 +289,5 @@ void Init_nokogumboc() {
// define Nokogumbo class with a singleton parse method
VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject);
- rb_define_singleton_method(Gumbo, "parse", parse, 1);
+ rb_define_singleton_method(Gumbo, "parse", parse, 2);
}
diff --git a/lib/nokogumbo.rb b/lib/nokogumbo.rb
index 92f2768b..d2ef252b 100644
--- a/lib/nokogumbo.rb
+++ b/lib/nokogumbo.rb
@@ -4,14 +4,14 @@
module Nokogiri
# Parse an HTML document. +string+ contains the document. +string+
# may also be an IO-like object. Returns a +Nokogiri::HTML::Document+.
- def self.HTML5(string)
- Nokogiri::HTML5.parse(string)
+ def self.HTML5(*args)
+ Nokogiri::HTML5.parse(*args)
end
module HTML5
# Parse an HTML document. +string+ contains the document. +string+
# may also be an IO-like object. Returns a +Nokogiri::HTML::Document+.
- def self.parse(string)
+ def self.parse(string, options={})
if string.respond_to? :read
string = string.read
end
@@ -21,7 +21,7 @@ def self.parse(string)
string = reencode(string)
end
- Nokogumbo.parse(string.to_s)
+ Nokogumbo.parse(string.to_s, options[:max_parse_errors] || 100)
end
# Fetch and parse a HTML document from the web, following redirects,
@@ -67,7 +67,7 @@ def self.get(uri, options={})
case response
when Net::HTTPSuccess
- doc = parse(reencode(response.body, response['content-type']))
+ doc = parse(reencode(response.body, response['content-type']), options)
doc.instance_variable_set('@response', response)
doc.class.send(:attr_reader, :response)
doc
@@ -83,8 +83,8 @@ def self.get(uri, options={})
# while fragment is on the Gumbo TODO list, simulate it by doing
# a full document parse and ignoring the parent , , and
# tags, and collecting up the children of each.
- def self.fragment(string)
- doc = parse(string)
+ def self.fragment(*args)
+ doc = parse(*args)
fragment = Nokogiri::HTML::DocumentFragment.new(doc)
if doc.children.length != 1 or doc.children.first.name != 'html'
diff --git a/test-nokogumbo.rb b/test-nokogumbo.rb
index a8f6eb85..bebf9fc4 100644
--- a/test-nokogumbo.rb
+++ b/test-nokogumbo.rb
@@ -78,7 +78,7 @@ def test_bogus_encoding
end
def test_html5_doctype
- doc = Nokogumbo.parse("")
+ doc = Nokogiri::HTML5.parse("")
assert_match //, doc.to_html
end
@@ -132,11 +132,37 @@ def test_parse_errors
assert_empty doc.errors
end
+ def test_max_parse_errors
+ # This document contains 2 parse errors, but we force limit to 1.
+ doc = Nokogiri::HTML5("", max_parse_errors: 1)
+ assert_equal 1, doc.errors.length
+ doc = Nokogiri::HTML5("", max_parse_errors: 1)
+ assert_empty doc.errors
+ end
+
+ def test_default_max_parse_errors
+ # This document contains 200 parse errors, but default limit is 100.
+ doc = Nokogiri::HTML5("" + "
" * 200)
+ assert_equal 100, doc.errors.length
+ end
+
def test_parse_fragment_errors
doc = Nokogiri::HTML5.fragment("<\r\n")
refute_empty doc.errors
end
+ def test_fragment_max_parse_errors
+ # This fragment contains 3 parse errors, but we force limit to 1.
+ doc = Nokogiri::HTML5.fragment("", max_parse_errors: 1)
+ assert_equal 1, doc.errors.length
+ end
+
+ def test_fragment_default_max_parse_errors
+ # This fragment contains 201 parse errors, but default limit is 100.
+ doc = Nokogiri::HTML5.fragment("" * 200)
+ assert_equal 100, doc.errors.length
+ end
+
private
def buffer
From e99c3e5902f2f9e0223128483b0959226e83c4b5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C3=ABl=20Blais=20Masson?=
Date: Sat, 20 Jan 2018 16:52:07 -0500
Subject: [PATCH 2/4] Make :max_parse_errors default to 0
This makes Gumbo parse error tracking opt-in for users wanting it.
---
lib/nokogumbo.rb | 2 +-
test-nokogumbo.rb | 14 +++++++-------
2 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/lib/nokogumbo.rb b/lib/nokogumbo.rb
index d2ef252b..c03cdb1b 100644
--- a/lib/nokogumbo.rb
+++ b/lib/nokogumbo.rb
@@ -21,7 +21,7 @@ def self.parse(string, options={})
string = reencode(string)
end
- Nokogumbo.parse(string.to_s, options[:max_parse_errors] || 100)
+ Nokogumbo.parse(string.to_s, options[:max_parse_errors] || 0)
end
# Fetch and parse a HTML document from the web, following redirects,
diff --git a/test-nokogumbo.rb b/test-nokogumbo.rb
index bebf9fc4..62091471 100644
--- a/test-nokogumbo.rb
+++ b/test-nokogumbo.rb
@@ -126,9 +126,9 @@ def test_root_comments
end
def test_parse_errors
- doc = Nokogiri::HTML5("")
+ doc = Nokogiri::HTML5("", max_parse_errors: 10)
assert_equal doc.errors.length, 2
- doc = Nokogiri::HTML5("")
+ doc = Nokogiri::HTML5("", max_parse_errors: 10)
assert_empty doc.errors
end
@@ -141,13 +141,13 @@ def test_max_parse_errors
end
def test_default_max_parse_errors
- # This document contains 200 parse errors, but default limit is 100.
+ # This document contains 200 parse errors, but default limit is 0.
doc = Nokogiri::HTML5("" + "" * 200)
- assert_equal 100, doc.errors.length
+ assert_equal 0, doc.errors.length
end
def test_parse_fragment_errors
- doc = Nokogiri::HTML5.fragment("<\r\n")
+ doc = Nokogiri::HTML5.fragment("<\r\n", max_parse_errors: 10)
refute_empty doc.errors
end
@@ -158,9 +158,9 @@ def test_fragment_max_parse_errors
end
def test_fragment_default_max_parse_errors
- # This fragment contains 201 parse errors, but default limit is 100.
+ # This fragment contains 201 parse errors, but default limit is 0.
doc = Nokogiri::HTML5.fragment("" * 200)
- assert_equal 100, doc.errors.length
+ assert_equal 0, doc.errors.length
end
private
From 253b779f1670c3b2f9f3db2f125f2ccf7c77fd83 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C3=ABl=20Blais=20Masson?=
Date: Sat, 20 Jan 2018 16:59:55 -0500
Subject: [PATCH 3/4] Check for `max_errors > 0` first
This check will be falsy most of the time now that 0 is the default.
---
ext/nokogumboc/nokogumbo.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ext/nokogumboc/nokogumbo.c b/ext/nokogumboc/nokogumbo.c
index 3256d6a3..030bd914 100644
--- a/ext/nokogumboc/nokogumbo.c
+++ b/ext/nokogumboc/nokogumbo.c
@@ -218,7 +218,7 @@ static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) {
VALUE rdoc = Nokogiri_wrap_xml_document(Document, doc);
// Add parse errors to rdoc.
- if (output->errors.length && max_errors > 0) {
+ if (max_errors > 0 && output->errors.length) {
GumboVector *errors = &output->errors;
GumboParser parser = { ._options = options };
GumboStringBuffer msg;
From c7397aab334fa1a3a3670b02f33daca04ee90524 Mon Sep 17 00:00:00 2001
From: Rafael Masson
Date: Wed, 24 Jan 2018 16:44:16 -0500
Subject: [PATCH 4/4] =?UTF-8?q?Use=20Gumbo=E2=80=99s=20built-in=20`max=5Fe?=
=?UTF-8?q?rrors`=20option?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
ext/nokogumboc/nokogumbo.c | 18 ++++++++++--------
1 file changed, 10 insertions(+), 8 deletions(-)
diff --git a/ext/nokogumboc/nokogumbo.c b/ext/nokogumboc/nokogumbo.c
index 030bd914..46523f37 100644
--- a/ext/nokogumboc/nokogumbo.c
+++ b/ext/nokogumboc/nokogumbo.c
@@ -185,11 +185,13 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node) {
// Parse a string using gumbo_parse into a Nokogiri document
static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) {
- int max_errors = NUM2INT(max_parse_errors);
- const GumboOptions *options = &kGumboDefaultOptions;
+ GumboOptions options;
+ memcpy(&options, &kGumboDefaultOptions, sizeof options);
+ options.max_errors = NUM2INT(max_parse_errors);
+
const char *input = RSTRING_PTR(string);
size_t input_len = RSTRING_LEN(string);
- GumboOutput *output = gumbo_parse_with_options(options, input, input_len);
+ GumboOutput *output = gumbo_parse_with_options(&options, input, input_len);
xmlDocPtr doc = xmlNewDoc(CONST_CAST "1.0");
#ifdef NGLIB
doc->type = XML_HTML_DOCUMENT_NODE;
@@ -218,14 +220,14 @@ static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) {
VALUE rdoc = Nokogiri_wrap_xml_document(Document, doc);
// Add parse errors to rdoc.
- if (max_errors > 0 && output->errors.length) {
+ if (output->errors.length) {
GumboVector *errors = &output->errors;
- GumboParser parser = { ._options = options };
+ GumboParser parser = { ._options = &options };
GumboStringBuffer msg;
- VALUE rerrors = rb_ary_new2(errors->length < max_errors ? errors->length : max_errors);
+ VALUE rerrors = rb_ary_new2(errors->length);
gumbo_string_buffer_init(&parser, &msg);
- for (int i=0; i < errors->length && i < max_errors; i++) {
+ for (int i=0; i < errors->length; i++) {
GumboError *err = errors->data[i];
gumbo_string_buffer_clear(&parser, &msg);
// Work around bug in gumbo_caret_diagnostic_to_string.
@@ -254,7 +256,7 @@ static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) {
gumbo_string_buffer_destroy(&parser, &msg);
}
- gumbo_destroy_output(options, output);
+ gumbo_destroy_output(&options, output);
return rdoc;
}