Ok, so I was browsing Shopify’s GitHub account and came across this: https://github.com/Shopify/onigmo/tree/main, which was uploaded just five days ago. This means that the code probably has plenty of bugs in it.
I already fuzzed html_tokenizer, which is another of Shopify’s Ruby extensions. I didn’t find any bugs in it, but it gave me plenty of experience and now I know how to fuzz other Ruby extensions too.
Now my initial attempt at writing a fuzzing wrapper was this:
/*
 * Smoke-test method for the harness: writes a fixed marker string to stdout
 * and returns nil. The receiver (`self`) is not used.
 */
static VALUE thing(VALUE self) {
    fputs("Called thing", stdout);
    return Qnil;
}
/*
 * First harness attempt. Boots the Ruby VM, registers the extension's
 * parse/compile functions both as instance methods of a scratch class "Foo"
 * and as singleton methods of an "Onigmo" module, then calls Foo#parse on a
 * one-character pattern.
 * Only the module and its two methods are set up here; nothing in this
 * function assigns the rb_cOnigmo*Node class globals.
 */
int main(int argc, char** argv) {
// Start the embedded interpreter before touching any other Ruby API.
ruby_init();
VALUE cFoo = rb_define_class("Foo", rb_cObject);
rb_define_method(cFoo, "parse", parse, 1);
rb_define_method(cFoo, "compile", compile, 1);
// Zero-argument method used only to confirm that method dispatch works.
rb_define_method(cFoo, "fuck", thing, 0);
VALUE rb_cOnigmo = rb_define_module("Onigmo");
rb_define_singleton_method(rb_cOnigmo, "parse", parse, 1);
rb_define_singleton_method(rb_cOnigmo, "compile", compile, 1);
//rb_define_method(rb_cOnigmo, "fuck", thing, 0);
VALUE x;
//x = rb_str_new_cstr("<div>"); // Example html string.
x = rb_str_new_cstr("a"); // Simple regex
VALUE obj = rb_class_new_instance(0, NULL, cFoo);
// rb_funcall(obj, rb_intern("parse"), 1, x);
// VALUE obj2 = rb_class_new_instance(0, NULL, rb_cOnigmo);
// obj
rb_funcall(obj, rb_intern("fuck"), 0);
// Run the parser on the sample pattern through the Foo instance.
rb_funcall(obj, rb_intern("parse"), 1, x);
//rb_funcall(obj2, rb_intern("fuck"), 0);
//rb_funcall(obj2, rb_intern("parse"), 1, x);
//return 0;
// Tear the VM down; the value returned here is the process exit status.
return ruby_cleanup(0);
}
(just append that to the onigmo.c file in the source code)
but it didn’t work. This is because it crashed on this line in build_node: return rb_class_new_instance(2, argv, rb_cOnigmoCallNode);
This is because there is the initialization function which looks like this:
void
Init_onigmo(void) {
VALUE rb_cOnigmo = rb_define_module("Onigmo");
rb_define_singleton_method(rb_cOnigmo, "parse", parse, 1);
rb_define_singleton_method(rb_cOnigmo, "compile", compile, 1);
rb_cOnigmoNode = rb_define_class_under(rb_cOnigmo, "Node", rb_cObject);
rb_cOnigmoAlternationNode = rb_define_class_under(rb_cOnigmo, "AlternationNode", rb_cOnigmoNode);
rb_cOnigmoAnchorBufferBeginNode = rb_define_class_under(rb_cOnigmo, "AnchorBufferBeginNode", rb_cOnigmoNode);
rb_cOnigmoAnchorBufferEndNode = rb_define_class_under(rb_cOnigmo, "AnchorBufferEndNode", rb_cOnigmoNode);
rb_cOnigmoAnchorKeepNode = rb_define_class_under(rb_cOnigmo, "AnchorKeepNode", rb_cOnigmoNode);
rb_cOnigmoAnchorLineBeginNode = rb_define_class_under(rb_cOnigmo, "AnchorLineBeginNode", rb_cOnigmoNode);
rb_cOnigmoAnchorLineEndNode = rb_define_class_under(rb_cOnigmo, "AnchorLineEndNode", rb_cOnigmoNode);
rb_cOnigmoAnchorPositionBeginNode = rb_define_class_under(rb_cOnigmo, "AnchorPositionBeginNode", rb_cOnigmoNode);
rb_cOnigmoAnchorSemiEndNode = rb_define_class_under(rb_cOnigmo, "AnchorSemiEndNode", rb_cOnigmoNode);
rb_cOnigmoAnchorWordBoundaryNode = rb_define_class_under(rb_cOnigmo, "AnchorWordBoundaryNode", rb_cOnigmoNode);
rb_cOnigmoAnchorWordBoundaryInvertNode = rb_define_class_under(rb_cOnigmo, "AnchorWordBoundaryInvertNode", rb_cOnigmoNode);
rb_cOnigmoAnyNode = rb_define_class_under(rb_cOnigmo, "AnyNode", rb_cOnigmoNode);
rb_cOnigmoBackrefNode = rb_define_class_under(rb_cOnigmo, "BackrefNode", rb_cOnigmoNode);
rb_cOnigmoCallNode = rb_define_class_under(rb_cOnigmo, "CallNode", rb_cOnigmoNode);
rb_cOnigmoCClassNode = rb_define_class_under(rb_cOnigmo, "CClassNode", rb_cOnigmoNode);
rb_cOnigmoCClassInvertNode = rb_define_class_under(rb_cOnigmo, "CClassInvertNode", rb_cOnigmoNode);
rb_cOnigmoEncloseAbsentNode = rb_define_class_under(rb_cOnigmo, "EncloseAbsentNode", rb_cOnigmoNode);
rb_cOnigmoEncloseConditionNode = rb_define_class_under(rb_cOnigmo, "EncloseConditionNode", rb_cOnigmoNode);
rb_cOnigmoEncloseMemoryNode = rb_define_class_under(rb_cOnigmo, "EncloseMemoryNode", rb_cOnigmoNode);
rb_cOnigmoEncloseOptionsNode = rb_define_class_under(rb_cOnigmo, "EncloseOptionsNode", rb_cOnigmoNode);
rb_cOnigmoEncloseStopBacktrackNode = rb_define_class_under(rb_cOnigmo, "EncloseStopBacktrackNode", rb_cOnigmoNode);
rb_cOnigmoListNode = rb_define_class_under(rb_cOnigmo, "ListNode", rb_cOnigmoNode);
rb_cOnigmoLookAheadNode = rb_define_class_under(rb_cOnigmo, "LookAheadNode", rb_cOnigmoNode);
rb_cOnigmoLookAheadInvertNode = rb_define_class_under(rb_cOnigmo, "LookAheadInvertNode", rb_cOnigmoNode);
rb_cOnigmoLookBehindNode = rb_define_class_under(rb_cOnigmo, "LookBehindNode", rb_cOnigmoNode);
rb_cOnigmoLookBehindInvertNode = rb_define_class_under(rb_cOnigmo, "LookBehindInvertNode", rb_cOnigmoNode);
rb_cOnigmoQuantifierNode = rb_define_class_under(rb_cOnigmo, "QuantifierNode", rb_cOnigmoNode);
rb_cOnigmoStringNode = rb_define_class_under(rb_cOnigmo, "StringNode", rb_cOnigmoNode);
rb_cOnigmoWordNode = rb_define_class_under(rb_cOnigmo, "WordNode", rb_cOnigmoNode);
rb_cOnigmoWordInvertNode = rb_define_class_under(rb_cOnigmo, "WordInvertNode", rb_cOnigmoNode);
}
so I think that I need to copy all of that to my fuzzing harness and then create an instance of the rb_cOnigmo object and then call parse on that instance… let’s see.
Here is my current wrapper:
/*
 * Smoke-test method for the harness: writes a fixed marker string to stdout
 * and returns nil. The receiver (`self`) is not used.
 */
static VALUE thing(VALUE self) {
    fputs("Called thing", stdout);
    return Qnil;
}
/*
 * Second harness attempt. In addition to the scratch "Foo" class, this version
 * inlines the body of Init_onigmo so that the Onigmo module and every
 * rb_cOnigmo*Node class global are set up before use, and then tries to
 * instantiate the Onigmo module itself.
 */
int main(int argc, char** argv) {
// Start the embedded interpreter before touching any other Ruby API.
ruby_init();
VALUE cFoo = rb_define_class("Foo", rb_cObject);
rb_define_method(cFoo, "parse", parse, 1);
rb_define_method(cFoo, "compile", compile, 1);
rb_define_method(cFoo, "fuck", thing, 0);
//VALUE rb_cOnigmo = rb_define_module("Onigmo");
//rb_define_singleton_method(rb_cOnigmo, "parse", parse, 1);
//rb_define_singleton_method(rb_cOnigmo, "compile", compile, 1);
//rb_define_method(rb_cOnigmo, "fuck", thing, 0);
VALUE x;
//x = rb_str_new_cstr("<div>"); // Example html string.
x = rb_str_new_cstr("a"); // Simple regex
VALUE obj = rb_class_new_instance(0, NULL, cFoo);
// rb_funcall(obj, rb_intern("parse"), 1, x);
// VALUE obj2 = rb_class_new_instance(0, NULL, rb_cOnigmo);
// obj
// Sanity check that method dispatch on the Foo instance works.
rb_funcall(obj, rb_intern("fuck"), 0);
//rb_funcall(obj, rb_intern("parse"), 1, x);
// --- Inlined copy of Init_onigmo: module, singleton methods, node classes ---
VALUE rb_cOnigmo = rb_define_module("Onigmo");
rb_define_singleton_method(rb_cOnigmo, "parse", parse, 1);
rb_define_singleton_method(rb_cOnigmo, "compile", compile, 1);
rb_cOnigmoNode = rb_define_class_under(rb_cOnigmo, "Node", rb_cObject);
rb_cOnigmoAlternationNode = rb_define_class_under(rb_cOnigmo, "AlternationNode", rb_cOnigmoNode);
rb_cOnigmoAnchorBufferBeginNode = rb_define_class_under(rb_cOnigmo, "AnchorBufferBeginNode", rb_cOnigmoNode);
rb_cOnigmoAnchorBufferEndNode = rb_define_class_under(rb_cOnigmo, "AnchorBufferEndNode", rb_cOnigmoNode);
rb_cOnigmoAnchorKeepNode = rb_define_class_under(rb_cOnigmo, "AnchorKeepNode", rb_cOnigmoNode);
rb_cOnigmoAnchorLineBeginNode = rb_define_class_under(rb_cOnigmo, "AnchorLineBeginNode", rb_cOnigmoNode);
rb_cOnigmoAnchorLineEndNode = rb_define_class_under(rb_cOnigmo, "AnchorLineEndNode", rb_cOnigmoNode);
rb_cOnigmoAnchorPositionBeginNode = rb_define_class_under(rb_cOnigmo, "AnchorPositionBeginNode", rb_cOnigmoNode);
rb_cOnigmoAnchorSemiEndNode = rb_define_class_under(rb_cOnigmo, "AnchorSemiEndNode", rb_cOnigmoNode);
rb_cOnigmoAnchorWordBoundaryNode = rb_define_class_under(rb_cOnigmo, "AnchorWordBoundaryNode", rb_cOnigmoNode);
rb_cOnigmoAnchorWordBoundaryInvertNode = rb_define_class_under(rb_cOnigmo, "AnchorWordBoundaryInvertNode", rb_cOnigmoNode);
rb_cOnigmoAnyNode = rb_define_class_under(rb_cOnigmo, "AnyNode", rb_cOnigmoNode);
rb_cOnigmoBackrefNode = rb_define_class_under(rb_cOnigmo, "BackrefNode", rb_cOnigmoNode);
rb_cOnigmoCallNode = rb_define_class_under(rb_cOnigmo, "CallNode", rb_cOnigmoNode);
rb_cOnigmoCClassNode = rb_define_class_under(rb_cOnigmo, "CClassNode", rb_cOnigmoNode);
rb_cOnigmoCClassInvertNode = rb_define_class_under(rb_cOnigmo, "CClassInvertNode", rb_cOnigmoNode);
rb_cOnigmoEncloseAbsentNode = rb_define_class_under(rb_cOnigmo, "EncloseAbsentNode", rb_cOnigmoNode);
rb_cOnigmoEncloseConditionNode = rb_define_class_under(rb_cOnigmo, "EncloseConditionNode", rb_cOnigmoNode);
rb_cOnigmoEncloseMemoryNode = rb_define_class_under(rb_cOnigmo, "EncloseMemoryNode", rb_cOnigmoNode);
rb_cOnigmoEncloseOptionsNode = rb_define_class_under(rb_cOnigmo, "EncloseOptionsNode", rb_cOnigmoNode);
rb_cOnigmoEncloseStopBacktrackNode = rb_define_class_under(rb_cOnigmo, "EncloseStopBacktrackNode", rb_cOnigmoNode);
rb_cOnigmoListNode = rb_define_class_under(rb_cOnigmo, "ListNode", rb_cOnigmoNode);
rb_cOnigmoLookAheadNode = rb_define_class_under(rb_cOnigmo, "LookAheadNode", rb_cOnigmoNode);
rb_cOnigmoLookAheadInvertNode = rb_define_class_under(rb_cOnigmo, "LookAheadInvertNode", rb_cOnigmoNode);
rb_cOnigmoLookBehindNode = rb_define_class_under(rb_cOnigmo, "LookBehindNode", rb_cOnigmoNode);
rb_cOnigmoLookBehindInvertNode = rb_define_class_under(rb_cOnigmo, "LookBehindInvertNode", rb_cOnigmoNode);
rb_cOnigmoQuantifierNode = rb_define_class_under(rb_cOnigmo, "QuantifierNode", rb_cOnigmoNode);
rb_cOnigmoStringNode = rb_define_class_under(rb_cOnigmo, "StringNode", rb_cOnigmoNode);
rb_cOnigmoWordNode = rb_define_class_under(rb_cOnigmo, "WordNode", rb_cOnigmoNode);
rb_cOnigmoWordInvertNode = rb_define_class_under(rb_cOnigmo, "WordInvertNode", rb_cOnigmoNode);
//rb_define_method(rb_cOnigmo, "fuck", thing, 0);
// rb_cOnigmo comes from rb_define_module above, so it is a module, not a
// class, yet it is passed here as the class to instantiate.
// NOTE(review): a module cannot be instantiated; since parse/compile are
// singleton methods of the module, the module itself is presumably the
// right rb_funcall receiver (see the commented-out call below) - confirm.
VALUE obj3 = rb_class_new_instance(0, NULL, rb_cOnigmo);
// rb_funcall(rb_cOnigmo, rb_intern("parse"), 1, x);
// VALUE obj2 = rb_class_new_instance(0, NULL, rb_cOnigmo);
// obj
//rb_funcall(obj3, rb_intern("fuck"), 0);
//rb_funcall(obj2, rb_intern("fuck"), 0);
//rb_funcall(obj2, rb_intern("parse"), 1, x);
//return 0;
// Tear the VM down; the value returned here is the process exit status.
return ruby_cleanup(0);
}
and it crashes on this line: VALUE obj3 = rb_class_new_instance(0, NULL, rb_cOnigmo);
here is the gdb backtrace:
Reading symbols from ./fuzzer...
(gdb) r
Starting program: /home/cyberhacker/Asioita/Hakkerointi/Fuzzing/onigmo/ext/onigmo/fuzzer
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
Program received signal SIGSEGV, Segmentation fault.
0x00007ffff7cbeca9 in ?? () from /lib/x86_64-linux-gnu/libruby-3.0.so.3.0
(gdb) where
#0 0x00007ffff7cbeca9 in ?? () from /lib/x86_64-linux-gnu/libruby-3.0.so.3.0
#1 0x00007ffff7cc4502 in ?? () from /lib/x86_64-linux-gnu/libruby-3.0.so.3.0
#2 0x00007ffff7cc4662 in rb_exc_raise () from /lib/x86_64-linux-gnu/libruby-3.0.so.3.0
#3 0x00007ffff7cbaa0b in ?? () from /lib/x86_64-linux-gnu/libruby-3.0.so.3.0
#4 0x00007ffff7c45ccb in rb_check_type () from /lib/x86_64-linux-gnu/libruby-3.0.so.3.0
#5 0x00007ffff7c46c7b in ?? () from /lib/x86_64-linux-gnu/libruby-3.0.so.3.0
#6 0x000055555555c763 in main (argc=<optimized out>, argv=<optimized out>) at onigmo.c:1149
as you can see, there is a call to rb_check_type which I guess checks the type???? Therefore I think that I am somehow calling the rb_class_new_instance function wrong. See, I am calling the function with a module as argument.
See, if I try to call VALUE obj3 = rb_class_new_instance(0, NULL, rb_cOnigmoLookAheadNode);
it works fine.
I looked absolutely everywhere for how to call a module’s method from C code, but I can’t find a single good tutorial on this. Maybe I should ask a question on Stack Overflow?
After doing some code cleanup, I now have this as my wrapper code:
/*
 * Cleaned-up harness: boots the Ruby VM, performs the same setup as
 * Init_onigmo inline, then attempts the C equivalent of Onigmo.parse("a") by
 * instantiating the module and calling "parse" on the result. Progress is
 * reported on stdout before each step.
 */
int main(int argc, char** argv) {
// Initialize ruby
ruby_init();
// Define example regex string
VALUE x;
x = rb_str_new_cstr("a"); // Very simple regex which matches an "a" character
// Initialize the Onigmo module
printf("Initializing the Onigmo module...\n");
VALUE rb_cOnigmo = rb_define_module("Onigmo");
rb_define_singleton_method(rb_cOnigmo, "parse", parse, 1);
rb_define_singleton_method(rb_cOnigmo, "compile", compile, 1);
// Base node class first, then one subclass per node kind.
rb_cOnigmoNode = rb_define_class_under(rb_cOnigmo, "Node", rb_cObject);
rb_cOnigmoAlternationNode = rb_define_class_under(rb_cOnigmo, "AlternationNode", rb_cOnigmoNode);
rb_cOnigmoAnchorBufferBeginNode = rb_define_class_under(rb_cOnigmo, "AnchorBufferBeginNode", rb_cOnigmoNode);
rb_cOnigmoAnchorBufferEndNode = rb_define_class_under(rb_cOnigmo, "AnchorBufferEndNode", rb_cOnigmoNode);
rb_cOnigmoAnchorKeepNode = rb_define_class_under(rb_cOnigmo, "AnchorKeepNode", rb_cOnigmoNode);
rb_cOnigmoAnchorLineBeginNode = rb_define_class_under(rb_cOnigmo, "AnchorLineBeginNode", rb_cOnigmoNode);
rb_cOnigmoAnchorLineEndNode = rb_define_class_under(rb_cOnigmo, "AnchorLineEndNode", rb_cOnigmoNode);
rb_cOnigmoAnchorPositionBeginNode = rb_define_class_under(rb_cOnigmo, "AnchorPositionBeginNode", rb_cOnigmoNode);
rb_cOnigmoAnchorSemiEndNode = rb_define_class_under(rb_cOnigmo, "AnchorSemiEndNode", rb_cOnigmoNode);
rb_cOnigmoAnchorWordBoundaryNode = rb_define_class_under(rb_cOnigmo, "AnchorWordBoundaryNode", rb_cOnigmoNode);
rb_cOnigmoAnchorWordBoundaryInvertNode = rb_define_class_under(rb_cOnigmo, "AnchorWordBoundaryInvertNode", rb_cOnigmoNode);
rb_cOnigmoAnyNode = rb_define_class_under(rb_cOnigmo, "AnyNode", rb_cOnigmoNode);
rb_cOnigmoBackrefNode = rb_define_class_under(rb_cOnigmo, "BackrefNode", rb_cOnigmoNode);
rb_cOnigmoCallNode = rb_define_class_under(rb_cOnigmo, "CallNode", rb_cOnigmoNode);
rb_cOnigmoCClassNode = rb_define_class_under(rb_cOnigmo, "CClassNode", rb_cOnigmoNode);
rb_cOnigmoCClassInvertNode = rb_define_class_under(rb_cOnigmo, "CClassInvertNode", rb_cOnigmoNode);
rb_cOnigmoEncloseAbsentNode = rb_define_class_under(rb_cOnigmo, "EncloseAbsentNode", rb_cOnigmoNode);
rb_cOnigmoEncloseConditionNode = rb_define_class_under(rb_cOnigmo, "EncloseConditionNode", rb_cOnigmoNode);
rb_cOnigmoEncloseMemoryNode = rb_define_class_under(rb_cOnigmo, "EncloseMemoryNode", rb_cOnigmoNode);
rb_cOnigmoEncloseOptionsNode = rb_define_class_under(rb_cOnigmo, "EncloseOptionsNode", rb_cOnigmoNode);
rb_cOnigmoEncloseStopBacktrackNode = rb_define_class_under(rb_cOnigmo, "EncloseStopBacktrackNode", rb_cOnigmoNode);
rb_cOnigmoListNode = rb_define_class_under(rb_cOnigmo, "ListNode", rb_cOnigmoNode);
rb_cOnigmoLookAheadNode = rb_define_class_under(rb_cOnigmo, "LookAheadNode", rb_cOnigmoNode);
rb_cOnigmoLookAheadInvertNode = rb_define_class_under(rb_cOnigmo, "LookAheadInvertNode", rb_cOnigmoNode);
rb_cOnigmoLookBehindNode = rb_define_class_under(rb_cOnigmo, "LookBehindNode", rb_cOnigmoNode);
rb_cOnigmoLookBehindInvertNode = rb_define_class_under(rb_cOnigmo, "LookBehindInvertNode", rb_cOnigmoNode);
rb_cOnigmoQuantifierNode = rb_define_class_under(rb_cOnigmo, "QuantifierNode", rb_cOnigmoNode);
rb_cOnigmoStringNode = rb_define_class_under(rb_cOnigmo, "StringNode", rb_cOnigmoNode);
rb_cOnigmoWordNode = rb_define_class_under(rb_cOnigmo, "WordNode", rb_cOnigmoNode);
rb_cOnigmoWordInvertNode = rb_define_class_under(rb_cOnigmo, "WordInvertNode", rb_cOnigmoNode);
printf("Creating instance of rb_cOnigmo...\n");
// rb_cOnigmo comes from rb_define_module above, so it is a module, not a
// class, yet it is passed here as the class to instantiate.
// NOTE(review): a module cannot be instantiated; since "parse" is a
// singleton method of the module, passing rb_cOnigmo itself as the
// rb_funcall receiver is presumably what Onigmo.parse("a") maps to - confirm.
VALUE module_object = rb_class_new_instance(0, NULL, rb_cOnigmo);
printf("Trying to call \"parse\" on the module...\n");
rb_funcall(module_object, rb_intern("parse"), 1, x);
printf("Done!\n");
// Tear the VM down; the value returned here is the process exit status.
return ruby_cleanup(0);
}
Let’s ask a question! Basically we want to accomplish this:
Onigmo.parse("a")
After a bit of formatting, I now have this: https://stackoverflow.com/questions/78215259/how-to-call-a-method-of-a-module-properly-in-ruby-c-api . Feel free to answer if you know what to do.
Now I am just going to wait until someone (hopefully) answers my question…
Ok, so I wasn’t patient enough to wait for an answer, so I modified the source code of the onigmo library to skip all of the parts where an instance of a class is created; I just return Qnil instead. Here is the entire onigmo.c file:
#include <ruby.h>
#include <ruby/onigmo.h>
#include <ruby/encoding.h>
#include "regint.h"
#include "regparse.h"
/*
 * File-scope Ruby class handles: rb_cOnigmoNode plus one handle per node
 * kind. build_node refers to these in its rb_class_new_instance calls (which
 * are commented out in this build). They hold no valid class until something
 * assigns them.
 */
VALUE rb_cOnigmoNode;
VALUE rb_cOnigmoAlternationNode;
VALUE rb_cOnigmoAnchorBufferBeginNode;
VALUE rb_cOnigmoAnchorBufferEndNode;
VALUE rb_cOnigmoAnchorKeepNode;
VALUE rb_cOnigmoAnchorLineBeginNode;
VALUE rb_cOnigmoAnchorLineEndNode;
VALUE rb_cOnigmoAnchorPositionBeginNode;
VALUE rb_cOnigmoAnchorSemiEndNode;
VALUE rb_cOnigmoAnchorWordBoundaryNode;
VALUE rb_cOnigmoAnchorWordBoundaryInvertNode;
VALUE rb_cOnigmoAnyNode;
VALUE rb_cOnigmoBackrefNode;
VALUE rb_cOnigmoCallNode;
VALUE rb_cOnigmoCClassNode;
VALUE rb_cOnigmoCClassInvertNode;
VALUE rb_cOnigmoEncloseAbsentNode;
VALUE rb_cOnigmoEncloseConditionNode;
VALUE rb_cOnigmoEncloseMemoryNode;
VALUE rb_cOnigmoEncloseOptionsNode;
VALUE rb_cOnigmoEncloseStopBacktrackNode;
VALUE rb_cOnigmoListNode;
VALUE rb_cOnigmoLookAheadNode;
VALUE rb_cOnigmoLookAheadInvertNode;
VALUE rb_cOnigmoLookBehindNode;
VALUE rb_cOnigmoLookBehindInvertNode;
VALUE rb_cOnigmoQuantifierNode;
VALUE rb_cOnigmoStringNode;
VALUE rb_cOnigmoWordNode;
VALUE rb_cOnigmoWordInvertNode;
/*
 * Translate an Onigmo option bitmask into a Ruby array of symbols.
 *
 * Each table entry whose flag shares at least one set bit with `option`
 * contributes its symbol; symbols are appended in table order.
 *
 * NOTE(review): if ONIG_OPTION_NONE is defined as 0, its entry can never
 * match with a bitwise AND - confirm against onigmo.h.
 */
static VALUE
build_options(OnigOptionType option) {
    const struct {
        OnigOptionType flag;
        const char *name;
    } flags[] = {
        { ONIG_OPTION_NONE, "none" },
        { ONIG_OPTION_IGNORECASE, "ignorecase" },
        { ONIG_OPTION_EXTEND, "extend" },
        { ONIG_OPTION_MULTILINE, "multiline" },
        { ONIG_OPTION_DOTALL, "dotall" },
        { ONIG_OPTION_SINGLELINE, "singleline" },
        { ONIG_OPTION_FIND_LONGEST, "find_longest" },
        { ONIG_OPTION_FIND_NOT_EMPTY, "find_not_empty" },
        { ONIG_OPTION_NEGATE_SINGLELINE, "negate_singleline" },
        { ONIG_OPTION_DONT_CAPTURE_GROUP, "dont_capture_group" },
        { ONIG_OPTION_CAPTURE_GROUP, "capture_group" },
        { ONIG_OPTION_NOTBOL, "not_bol" },
        { ONIG_OPTION_NOTEOL, "not_eol" },
        { ONIG_OPTION_NOTBOS, "not_bos" },
        { ONIG_OPTION_NOTEOS, "not_eos" },
        { ONIG_OPTION_ASCII_RANGE, "ascii_range" },
        { ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, "posix_bracket_all_range" },
        { ONIG_OPTION_WORD_BOUND_ALL_RANGE, "word_bound_all_range" },
        { ONIG_OPTION_NEWLINE_CRLF, "newline_crlf" },
    };

    VALUE symbols = rb_ary_new();
    for (size_t i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
        if (option & flags[i].flag) {
            rb_ary_push(symbols, ID2SYM(rb_intern(flags[i].name)));
        }
    }
    return symbols;
}
/*
 * Build a Ruby array holding one single-byte string (tagged with `encoding`)
 * for every position below SINGLE_BYTE_SIZE that is set in the bitset `ref`.
 * Entries are appended in ascending byte order.
 */
static VALUE
build_bitset(BitSetRef ref, OnigEncoding encoding) {
    VALUE members = rb_ary_new();
    int byte = 0;

    while (byte < SINGLE_BYTE_SIZE) {
        if (BITSET_AT(ref, byte)) {
            const char ch = (const char) byte;
            rb_ary_push(members, rb_enc_str_new(&ch, 1, encoding));
        }
        byte++;
    }

    return members;
}
/*
 * Recursively walk an Onigmo parse tree and compute the constructor arguments
 * of the Ruby node object corresponding to each tree node.
 *
 * node     - parse tree node to convert; its kind is taken from NTYPE(node).
 * encoding - encoding attached to every Ruby string created from pattern bytes.
 *
 * In this build every rb_class_new_instance() call is stubbed out (kept as a
 * trailing comment next to each return), so the function always returns Qnil.
 * The arguments are still computed, which keeps the traversal of child nodes
 * and the accesses to node fields exercised.
 */
static VALUE
build_node(Node *node, OnigEncoding encoding) {
    int type = NTYPE(node);
    switch (type) {
        case NT_STR: {
            VALUE value = rb_enc_str_new((const char *) NSTR(node)->s, NSTR(node)->end - NSTR(node)->s, encoding);
            VALUE argv[] = { value };
            return Qnil; // return rb_class_new_instance(1, argv, rb_cOnigmoStringNode);
        }
        case NT_CCLASS: {
            CClassNode* cclass_node = NCCLASS(node);
            /* Single-byte members come from the bitset ... */
            VALUE values = build_bitset(cclass_node->bs, encoding);
            /* ... and the optional buffer contributes code points as integers. */
            if (cclass_node->mbuf != NULL) {
                BBuf *bbuf = cclass_node->mbuf;
                OnigCodePoint *data = (OnigCodePoint *) bbuf->p;
                OnigCodePoint *end = (OnigCodePoint *) (bbuf->p + bbuf->used);
                /*
                 * The first OnigCodePoint is skipped (presumably the range
                 * count - confirm); the rest is consumed as (from, to) pairs.
                 * NOTE(review): the upper bound is exclusive here
                 * (code < data[1]); confirm whether stored ranges are inclusive.
                 */
                for (++data; data < end; data += 2) {
                    for (OnigCodePoint code = data[0]; code < data[1]; code++) {
                        rb_ary_push(values, INT2NUM(code));
                    }
                }
            }
            VALUE argv[] = { values };
            if (IS_NCCLASS_NOT(cclass_node)) {
                return Qnil; // return rb_class_new_instance(1, argv, rb_cOnigmoCClassInvertNode);
            } else {
                return Qnil; // return rb_class_new_instance(1, argv, rb_cOnigmoCClassNode);
            }
        }
        case NT_CTYPE: {
            if (NCTYPE(node)->ctype == ONIGENC_CTYPE_WORD) {
                if (NCTYPE(node)->not == 0) {
                    return Qnil; // return rb_class_new_instance(0, NULL, rb_cOnigmoWordNode);
                } else {
                    return Qnil; // return rb_class_new_instance(0, NULL, rb_cOnigmoWordInvertNode);
                }
            } else {
                /* NOTE(review): a string literal is always non-NULL, so this
                 * assertion presumably can never fire - confirm intent. */
                RUBY_ASSERT("unknown ctype");
                return Qnil;
            }
        }
        case NT_CANY: {
            return Qnil; // return rb_class_new_instance(0, NULL, rb_cOnigmoAnyNode);
        }
        case NT_BREF: {
            BRefNode *backref_node = NBREF(node);
            int *backrefs = BACKREFS_P(backref_node);
            VALUE values = rb_ary_new();
            for (int index = 0; index < backref_node->back_num; index++) {
                rb_ary_push(values, INT2NUM(backrefs[index]));
            }
            VALUE argv[] = { values };
            return Qnil; // return rb_class_new_instance(1, argv, rb_cOnigmoBackrefNode);
        }
        case NT_QTFR: {
            int lower = NQTFR(node)->lower;
            int upper = NQTFR(node)->upper;
            VALUE argv[] = {
                /* A bound of -1 is reported as nil. */
                lower == -1 ? Qnil : INT2NUM(lower),
                /* Fixed: this used to be `upper = -1 ? ...`, an assignment that
                 * overwrote `upper` and made this slot nil unconditionally. */
                upper == -1 ? Qnil : INT2NUM(upper),
                (NQTFR(node)->greedy ? Qtrue : Qfalse),
                build_node(NQTFR(node)->target, encoding)
            };
            return Qnil; // return rb_class_new_instance(4, argv, rb_cOnigmoQuantifierNode);
        }
        case NT_ENCLOSE: {
            /* The enclosed sub-tree is converted first, whatever the enclose type. */
            VALUE target = build_node(NENCLOSE(node)->target, encoding);
            switch (NENCLOSE(node)->type) {
                case ENCLOSE_OPTION: {
                    VALUE argv[] = { build_options(NENCLOSE(node)->option), target };
                    return Qnil; // return rb_class_new_instance(2, argv, rb_cOnigmoEncloseOptionsNode);
                }
                case ENCLOSE_MEMORY: {
                    VALUE argv[] = { INT2NUM(NENCLOSE(node)->regnum), target };
                    return Qnil; // return rb_class_new_instance(2, argv, rb_cOnigmoEncloseMemoryNode);
                }
                case ENCLOSE_STOP_BACKTRACK: {
                    VALUE argv[] = { target };
                    return Qnil; // return rb_class_new_instance(1, argv, rb_cOnigmoEncloseStopBacktrackNode);
                }
                case ENCLOSE_CONDITION: {
                    VALUE argv[] = { INT2NUM(NENCLOSE(node)->regnum), target };
                    return Qnil; // return rb_class_new_instance(2, argv, rb_cOnigmoEncloseConditionNode);
                }
                case ENCLOSE_ABSENT: {
                    VALUE argv[] = { target };
                    return Qnil; // return rb_class_new_instance(1, argv, rb_cOnigmoEncloseAbsentNode);
                }
                default:
                    RUBY_ASSERT("unknown enclose type");
                    return Qnil;
            }
        }
        case NT_ANCHOR: {
            switch (NANCHOR(node)->type) {
                case ANCHOR_BEGIN_BUF:
                    return Qnil; // return rb_class_new_instance(0, NULL, rb_cOnigmoAnchorBufferBeginNode);
                case ANCHOR_END_BUF:
                    return Qnil; // return rb_class_new_instance(0, NULL, rb_cOnigmoAnchorBufferEndNode);
                case ANCHOR_BEGIN_LINE:
                    return Qnil; // return rb_class_new_instance(0, NULL, rb_cOnigmoAnchorLineBeginNode);
                case ANCHOR_END_LINE:
                    return Qnil; // return rb_class_new_instance(0, NULL, rb_cOnigmoAnchorLineEndNode);
                case ANCHOR_SEMI_END_BUF:
                    return Qnil; // return rb_class_new_instance(0, NULL, rb_cOnigmoAnchorSemiEndNode);
                case ANCHOR_BEGIN_POSITION:
                    return Qnil; // return rb_class_new_instance(0, NULL, rb_cOnigmoAnchorPositionBeginNode);
                case ANCHOR_WORD_BOUND:
                    return Qnil; // return rb_class_new_instance(0, NULL, rb_cOnigmoAnchorWordBoundaryNode);
                case ANCHOR_NOT_WORD_BOUND:
                    return Qnil; // return rb_class_new_instance(0, NULL, rb_cOnigmoAnchorWordBoundaryInvertNode);
                /* The four look-around anchors carry a sub-tree that is converted recursively. */
                case ANCHOR_PREC_READ: {
                    VALUE target = build_node(NANCHOR(node)->target, encoding);
                    VALUE argv[] = { target };
                    return Qnil; // return rb_class_new_instance(1, argv, rb_cOnigmoLookAheadNode);
                }
                case ANCHOR_PREC_READ_NOT: {
                    VALUE target = build_node(NANCHOR(node)->target, encoding);
                    VALUE argv[] = { target };
                    return Qnil; // return rb_class_new_instance(1, argv, rb_cOnigmoLookAheadInvertNode);
                }
                case ANCHOR_LOOK_BEHIND: {
                    VALUE target = build_node(NANCHOR(node)->target, encoding);
                    VALUE argv[] = { target };
                    return Qnil; // return rb_class_new_instance(1, argv, rb_cOnigmoLookBehindNode);
                }
                case ANCHOR_LOOK_BEHIND_NOT: {
                    VALUE target = build_node(NANCHOR(node)->target, encoding);
                    VALUE argv[] = { target };
                    return Qnil; // return rb_class_new_instance(1, argv, rb_cOnigmoLookBehindInvertNode);
                }
                case ANCHOR_KEEP:
                    return Qnil; // return rb_class_new_instance(0, NULL, rb_cOnigmoAnchorKeepNode);
                default:
                    RUBY_ASSERT("unknown anchor type");
                    return Qnil;
            }
        }
        case NT_LIST: {
            /* Convert the head element, then follow the NCDR chain to the end. */
            VALUE nodes = rb_ary_new();
            rb_ary_push(nodes, build_node(NCAR(node), encoding));
            while (IS_NOT_NULL(node = NCDR(node))) {
                RUBY_ASSERT(NTYPE(node) == type);
                rb_ary_push(nodes, build_node(NCAR(node), encoding));
            }
            VALUE argv[] = { nodes };
            return Qnil; // return rb_class_new_instance(1, argv, rb_cOnigmoListNode);
        }
        case NT_ALT: {
            /* Same NCAR/NCDR traversal as NT_LIST, for alternation branches. */
            VALUE nodes = rb_ary_new();
            rb_ary_push(nodes, build_node(NCAR(node), encoding));
            while (IS_NOT_NULL(node = NCDR(node))) {
                RUBY_ASSERT(NTYPE(node) == type);
                rb_ary_push(nodes, build_node(NCAR(node), encoding));
            }
            VALUE argv[] = { nodes };
            return Qnil; // return rb_class_new_instance(1, argv, rb_cOnigmoAlternationNode);
        }
        case NT_CALL: {
            CallNode *call_node = NCALL(node);
            VALUE name;
            ptrdiff_t length = call_node->name_end - call_node->name;
            /* An empty name span is reported as nil instead of an empty string. */
            if (length > 0) {
                name = rb_enc_str_new((const char *) call_node->name, length, encoding);
            } else {
                name = Qnil;
            }
            VALUE argv[] = { INT2NUM(call_node->group_num), name };
            return Qnil; // return rb_class_new_instance(2, argv, rb_cOnigmoCallNode);
        }
        default: {
            RUBY_ASSERT("unknown node type");
            return Qnil;
        }
    }
}
/*
 * Common failure path for parse/compile: releases `regex` and returns.
 *
 * result - Onigmo error code reported by the failing call.
 * regex  - regex object to release with onig_free().
 * einfo  - extra error details, or NULL when the caller has none.
 *
 * Unlike upstream, this build does not raise a Ruby exception (the harness
 * must keep running on invalid patterns), so the formatted message is
 * currently discarded.
 */
static void
fail(int result, regex_t *regex, OnigErrorInfo *einfo) {
    OnigUChar message[ONIG_MAX_ERROR_MESSAGE_LEN] = { 0 };

    /*
     * Only format the message when error details are available: for error
     * codes that carry a parameter, onig_error_code_to_str reads through the
     * OnigErrorInfo pointer, and parse() passes NULL here.
     */
    if (einfo != NULL) {
        onig_error_code_to_str(message, result, einfo);
    }

    onig_free(regex);
    // Do not raise exception.
    //rb_raise(rb_eArgError, "%s", message);
}
/*
 * Onigmo.parse(string): parse `string` as a regular expression and convert
 * the resulting parse tree with build_node().
 *
 * Returns build_node()'s result on success. Any Onigmo failure is routed
 * through fail() and reported as nil. Raises NoMemoryError if the regex object
 * cannot be allocated.
 */
static VALUE
parse(VALUE self, VALUE string) {
    const OnigUChar *pattern = (const OnigUChar *) StringValueCStr(string);
    const OnigUChar *pattern_end = pattern + strlen((const char *) pattern);

    regex_t *regex = calloc(1, sizeof(regex_t));
    if (regex == NULL) {
        rb_raise(rb_eNoMemError, "failed to allocate memory");
        return Qnil;
    }

    /* Step 1: initialise the regex object for the string's encoding. */
    int status;
    OnigEncoding encoding = rb_enc_get(string);
    status = onig_reg_init(regex, ONIG_OPTION_DEFAULT, ONIGENC_CASE_FOLD_DEFAULT, encoding, ONIG_SYNTAX_DEFAULT);
    if (status != ONIG_NORMAL) {
        fail(status, regex, NULL);
        return Qnil;
    }

    /* Step 2: size the regex buffer at twice the pattern length. */
    OnigDistance init_size = (pattern_end - pattern) * 2;
    status = BBUF_INIT(regex, init_size);
    if (status != ONIG_NORMAL) {
        fail(status, regex, NULL);
        return Qnil;
    }

    /* Step 3: build the parse tree. */
    Node *root;
    ScanEnv scan_env = { 0 };
    status = onig_parse_make_tree(&root, pattern, pattern_end, regex, &scan_env);
    if (status != ONIG_NORMAL) {
        fail(status, regex, NULL);
        return Qnil;
    }

    /* Step 4: convert the tree, then release everything Onigmo allocated. */
    VALUE tree = build_node(root, encoding);
    onig_node_free(root);
    onig_free(regex);
    onig_end();
    return tree;
}
/*
 * Read a MemNumType operand at *cursor, advance *cursor by SIZE_MEMNUM, and
 * return the value as a Ruby Integer.
 */
static VALUE
read_memnum(const unsigned char **cursor) {
    const MemNumType *operand = (const MemNumType *) *cursor;
    MemNumType value = *operand;

    *cursor += SIZE_MEMNUM;
    return INT2NUM(value);
}
/*
 * Fetch a relative-address operand through GET_RELADDR_INC and return it as a
 * Ruby Integer. The macro is given *cursor and is expected to advance it past
 * the operand (behaviour defined in regint.h - confirm).
 */
static VALUE
read_reladdr(const unsigned char **cursor) {
    RelAddrType offset;

    GET_RELADDR_INC(offset, *cursor);

    return INT2NUM(offset);
}
/*
 * Fetch an absolute-address operand through GET_ABSADDR_INC and return it as a
 * Ruby Integer. The macro is given *cursor and is expected to advance it past
 * the operand (behaviour defined in regint.h - confirm).
 */
static VALUE
read_absaddr(const unsigned char **cursor) {
    AbsAddrType target;

    GET_ABSADDR_INC(target, *cursor);

    return INT2NUM(target);
}
/*
 * Copy `length` bytes starting at *cursor into a new Ruby string tagged with
 * `encoding`, then move *cursor past those bytes. Returns the new string.
 */
static VALUE
read_exact(const unsigned char **cursor, int length, OnigEncoding encoding) {
    const unsigned char *start = *cursor;
    VALUE text = rb_enc_str_new((const char *) start, length, encoding);

    *cursor = start + length;
    return text;
}
/*
 * Fetch a length operand through GET_LENGTH_INC and return it as a Ruby
 * Integer. The macro is given *cursor and is expected to advance it past the
 * operand (behaviour defined in regint.h - confirm).
 */
static VALUE
read_length(const unsigned char **cursor) {
    LengthType count;

    GET_LENGTH_INC(count, *cursor);

    return INT2NUM(count);
}
/*
 * Interpret the bytes at *cursor as a bitset, convert it with build_bitset(),
 * and advance *cursor by SIZE_BITSET. Returns the array built by
 * build_bitset().
 */
static VALUE
read_bitset(const unsigned char **cursor, OnigEncoding encoding) {
    BitSetRef bits = (BitSetRef) (*cursor);
    VALUE members = build_bitset(bits, encoding);

    *cursor += SIZE_BITSET;
    return members;
}
/*
 * Read an OnigOptionType operand at *cursor, advance *cursor by SIZE_OPTION,
 * and return the option flags as built by build_options().
 */
static VALUE
read_option(const unsigned char **cursor) {
    /*
     * Fixed: the operand lives at *cursor. The previous code cast `cursor`
     * itself, which reinterpreted the bytes of the pointer variable as the
     * option value instead of reading the operand.
     */
    OnigOptionType option = *((OnigOptionType *) *cursor);
    *cursor += SIZE_OPTION;
    return build_options(option);
}
/*
 * Read a StateCheckNumType operand at *cursor, advance *cursor by
 * SIZE_STATE_CHECK_NUM, and return the value as a Ruby Integer.
 */
static VALUE
read_state_check(const unsigned char **cursor) {
    /*
     * Fixed: the operand lives at *cursor. The previous code cast `cursor`
     * itself, which reinterpreted the bytes of the pointer variable as the
     * state-check number instead of reading the operand.
     */
    StateCheckNumType state_check = *((StateCheckNumType *) *cursor);
    *cursor += SIZE_STATE_CHECK_NUM;
    return INT2NUM(state_check);
}
/*
 * Read one OnigCodePoint from the position at *cursor and return it as a Ruby
 * Integer, then advance *cursor by `length` bytes.
 *
 * When PLATFORM_UNALIGNED_WORD_ACCESS is not defined, the read position is
 * first adjusted with ALIGNMENT_RIGHT; *cursor itself is still advanced from
 * its original, unadjusted value.
 */
static VALUE
read_codepoint(const unsigned char **cursor, LengthType length) {
    const unsigned char *position = *cursor;
#ifndef PLATFORM_UNALIGNED_WORD_ACCESS
    ALIGNMENT_RIGHT(position);
#endif
    OnigCodePoint codepoint = *((OnigCodePoint *) position);

    *cursor += length;
    return INT2NUM(codepoint);
}
/*
 * Compile `string` as a regular expression and return a disassembly of the
 * resulting Onigmo bytecode.
 *
 * The pattern is compiled with onig_new using the string's own encoding,
 * ONIG_OPTION_DEFAULT and ONIG_SYNTAX_DEFAULT. The compiled program
 * (regex->p, regex->used bytes long) is then walked one opcode at a time.
 *
 * Returns a Ruby Array with one entry per instruction. Each entry is itself
 * an Array whose first element is a Symbol naming the opcode, followed by the
 * decoded operands in the order they appear in the bytecode. An opcode byte
 * that matches none of the cases below produces an empty entry.
 *
 * If onig_new reports an error, fail() is called with the error details; if
 * fail() returns, nil is returned.
 *
 * Operand layout notes for the length-prefixed string opcodes:
 *   exactn, exactn_ic : [length, string]        (length is in bytes)
 *   exactmb2n/mb3n    : [length, string]        (length is in characters)
 *   exactmbn          : [mb_length, length, string]
 *                       (bytes per character, character count)
 * Every one of these consumes its string payload so that the cursor stays in
 * sync with the instruction stream.
 */
static VALUE
compile(VALUE self, VALUE string) {
    const OnigUChar *pattern = (const OnigUChar *) StringValueCStr(string);
    regex_t *regex;
    OnigErrorInfo einfo;
    OnigEncoding encoding = rb_enc_get(string);

    int result = onig_new(&regex, pattern, pattern + strlen((const char *) pattern), ONIG_OPTION_DEFAULT, encoding, ONIG_SYNTAX_DEFAULT, &einfo);
    if (result != ONIG_NORMAL) {
        fail(result, regex, &einfo);
        return Qnil;
    }

    VALUE insns = rb_ary_new();
    const unsigned char *cursor = regex->p;
    const unsigned char *end = cursor + regex->used;

    while (cursor < end) {
        VALUE insn = rb_ary_new();

        switch (*cursor++) {
            case OP_FINISH: {
                rb_ary_push(insn, ID2SYM(rb_intern("finish")));
                break;
            }
            case OP_END: {
                rb_ary_push(insn, ID2SYM(rb_intern("end")));
                break;
            }
            case OP_EXACT1: {
                rb_ary_push(insn, ID2SYM(rb_intern("exact1")));
                rb_ary_push(insn, read_exact(&cursor, 1, encoding));
                break;
            }
            case OP_EXACT2: {
                rb_ary_push(insn, ID2SYM(rb_intern("exact2")));
                rb_ary_push(insn, read_exact(&cursor, 2, encoding));
                break;
            }
            case OP_EXACT3: {
                rb_ary_push(insn, ID2SYM(rb_intern("exact3")));
                rb_ary_push(insn, read_exact(&cursor, 3, encoding));
                break;
            }
            case OP_EXACT4: {
                rb_ary_push(insn, ID2SYM(rb_intern("exact4")));
                rb_ary_push(insn, read_exact(&cursor, 4, encoding));
                break;
            }
            case OP_EXACT5: {
                rb_ary_push(insn, ID2SYM(rb_intern("exact5")));
                rb_ary_push(insn, read_exact(&cursor, 5, encoding));
                break;
            }
            case OP_EXACTN: {
                rb_ary_push(insn, ID2SYM(rb_intern("exactn")));
                VALUE length = read_length(&cursor);
                rb_ary_push(insn, length);
                // The string payload follows the length operand; it has to be
                // consumed or its bytes would be decoded as opcodes.
                rb_ary_push(insn, read_exact(&cursor, NUM2INT(length), encoding));
                break;
            }
            case OP_EXACTMB2N1: {
                rb_ary_push(insn, ID2SYM(rb_intern("exactmb2n1")));
                rb_ary_push(insn, read_exact(&cursor, 2, encoding));
                break;
            }
            case OP_EXACTMB2N2: {
                rb_ary_push(insn, ID2SYM(rb_intern("exactmb2n2")));
                rb_ary_push(insn, read_exact(&cursor, 4, encoding));
                break;
            }
            case OP_EXACTMB2N3: {
                rb_ary_push(insn, ID2SYM(rb_intern("exactmb2n3")));
                rb_ary_push(insn, read_exact(&cursor, 6, encoding));
                break;
            }
            case OP_EXACTMB2N: {
                rb_ary_push(insn, ID2SYM(rb_intern("exactmb2n")));
                VALUE length = read_length(&cursor);
                rb_ary_push(insn, length);
                rb_ary_push(insn, read_exact(&cursor, NUM2INT(length) * 2, encoding));
                break;
            }
            case OP_EXACTMB3N: {
                rb_ary_push(insn, ID2SYM(rb_intern("exactmb3n")));
                VALUE length = read_length(&cursor);
                rb_ary_push(insn, length);
                rb_ary_push(insn, read_exact(&cursor, NUM2INT(length) * 3, encoding));
                break;
            }
            case OP_EXACTMBN: {
                rb_ary_push(insn, ID2SYM(rb_intern("exactmbn")));
                // Two length operands: bytes per character, then the number
                // of characters. The payload is their product in bytes.
                VALUE mb_length = read_length(&cursor);
                VALUE length = read_length(&cursor);
                rb_ary_push(insn, mb_length);
                rb_ary_push(insn, length);
                rb_ary_push(insn, read_exact(&cursor, NUM2INT(mb_length) * NUM2INT(length), encoding));
                break;
            }
            case OP_EXACT1_IC: {
                rb_ary_push(insn, ID2SYM(rb_intern("exact1_ic")));
                // No explicit length operand: the payload is one character,
                // so its byte length comes from the encoding.
                LengthType char_length = enclen(encoding, cursor, end);
                rb_ary_push(insn, read_exact(&cursor, char_length, encoding));
                break;
            }
            case OP_EXACTN_IC: {
                rb_ary_push(insn, ID2SYM(rb_intern("exactn_ic")));
                // A byte-length operand precedes the payload; it must be read
                // as a length rather than measured as a character.
                VALUE length = read_length(&cursor);
                rb_ary_push(insn, length);
                rb_ary_push(insn, read_exact(&cursor, NUM2INT(length), encoding));
                break;
            }
            case OP_CCLASS: {
                rb_ary_push(insn, ID2SYM(rb_intern("cclass")));
                rb_ary_push(insn, read_bitset(&cursor, encoding));
                break;
            }
            case OP_CCLASS_MB: {
                rb_ary_push(insn, ID2SYM(rb_intern("cclass_mb")));
                VALUE length = read_length(&cursor);
                rb_ary_push(insn, length);
                rb_ary_push(insn, read_codepoint(&cursor, NUM2INT(length)));
                break;
            }
            case OP_CCLASS_MB_NOT: {
                rb_ary_push(insn, ID2SYM(rb_intern("cclass_mb_not")));
                VALUE length = read_length(&cursor);
                rb_ary_push(insn, length);
                rb_ary_push(insn, read_codepoint(&cursor, NUM2INT(length)));
                break;
            }
            case OP_CCLASS_NOT: {
                rb_ary_push(insn, ID2SYM(rb_intern("cclass_not")));
                rb_ary_push(insn, read_bitset(&cursor, encoding));
                break;
            }
            case OP_CCLASS_MIX: {
                rb_ary_push(insn, ID2SYM(rb_intern("cclass_mix")));
                rb_ary_push(insn, read_bitset(&cursor, encoding));
                VALUE length = read_length(&cursor);
                rb_ary_push(insn, length);
                rb_ary_push(insn, read_codepoint(&cursor, NUM2INT(length)));
                break;
            }
            case OP_CCLASS_MIX_NOT: {
                rb_ary_push(insn, ID2SYM(rb_intern("cclass_mix_not")));
                rb_ary_push(insn, read_bitset(&cursor, encoding));
                VALUE length = read_length(&cursor);
                rb_ary_push(insn, length);
                rb_ary_push(insn, read_codepoint(&cursor, NUM2INT(length)));
                break;
            }
            case OP_ANYCHAR: {
                rb_ary_push(insn, ID2SYM(rb_intern("anychar")));
                break;
            }
            case OP_ANYCHAR_ML: {
                rb_ary_push(insn, ID2SYM(rb_intern("anychar_ml")));
                break;
            }
            case OP_ANYCHAR_STAR: {
                rb_ary_push(insn, ID2SYM(rb_intern("anychar_star")));
                break;
            }
            case OP_ANYCHAR_ML_STAR: {
                rb_ary_push(insn, ID2SYM(rb_intern("anychar_ml_star")));
                break;
            }
            case OP_ANYCHAR_STAR_PEEK_NEXT: {
                rb_ary_push(insn, ID2SYM(rb_intern("anychar_star_peek_next")));
                rb_ary_push(insn, read_exact(&cursor, 1, encoding));
                break;
            }
            case OP_ANYCHAR_ML_STAR_PEEK_NEXT: {
                rb_ary_push(insn, ID2SYM(rb_intern("anychar_ml_star_peek_next")));
                rb_ary_push(insn, read_exact(&cursor, 1, encoding));
                break;
            }
            case OP_WORD: {
                rb_ary_push(insn, ID2SYM(rb_intern("word")));
                break;
            }
            case OP_NOT_WORD: {
                rb_ary_push(insn, ID2SYM(rb_intern("not_word")));
                break;
            }
            case OP_WORD_BOUND: {
                rb_ary_push(insn, ID2SYM(rb_intern("word_bound")));
                break;
            }
            case OP_NOT_WORD_BOUND: {
                rb_ary_push(insn, ID2SYM(rb_intern("not_word_bound")));
                break;
            }
            case OP_WORD_BEGIN: {
                rb_ary_push(insn, ID2SYM(rb_intern("word_begin")));
                break;
            }
            case OP_WORD_END: {
                rb_ary_push(insn, ID2SYM(rb_intern("word_end")));
                break;
            }
            case OP_ASCII_WORD: {
                rb_ary_push(insn, ID2SYM(rb_intern("ascii_word")));
                break;
            }
            case OP_NOT_ASCII_WORD: {
                rb_ary_push(insn, ID2SYM(rb_intern("not_ascii_word")));
                break;
            }
            case OP_ASCII_WORD_BOUND: {
                rb_ary_push(insn, ID2SYM(rb_intern("ascii_word_bound")));
                break;
            }
            case OP_NOT_ASCII_WORD_BOUND: {
                rb_ary_push(insn, ID2SYM(rb_intern("not_ascii_word_bound")));
                break;
            }
            case OP_ASCII_WORD_BEGIN: {
                rb_ary_push(insn, ID2SYM(rb_intern("ascii_word_begin")));
                break;
            }
            case OP_ASCII_WORD_END: {
                rb_ary_push(insn, ID2SYM(rb_intern("ascii_word_end")));
                break;
            }
            case OP_BEGIN_BUF: {
                rb_ary_push(insn, ID2SYM(rb_intern("begin_buf")));
                break;
            }
            case OP_END_BUF: {
                rb_ary_push(insn, ID2SYM(rb_intern("end_buf")));
                break;
            }
            case OP_BEGIN_LINE: {
                rb_ary_push(insn, ID2SYM(rb_intern("begin_line")));
                break;
            }
            case OP_END_LINE: {
                rb_ary_push(insn, ID2SYM(rb_intern("end_line")));
                break;
            }
            case OP_SEMI_END_BUF: {
                rb_ary_push(insn, ID2SYM(rb_intern("semi_end_buf")));
                break;
            }
            case OP_BEGIN_POSITION: {
                rb_ary_push(insn, ID2SYM(rb_intern("begin_position")));
                break;
            }
            case OP_BACKREF1: {
                rb_ary_push(insn, ID2SYM(rb_intern("backref1")));
                break;
            }
            case OP_BACKREF2: {
                rb_ary_push(insn, ID2SYM(rb_intern("backref2")));
                break;
            }
            case OP_BACKREFN: {
                rb_ary_push(insn, ID2SYM(rb_intern("backrefn")));
                rb_ary_push(insn, read_memnum(&cursor));
                break;
            }
            case OP_BACKREFN_IC: {
                rb_ary_push(insn, ID2SYM(rb_intern("backrefn_ic")));
                rb_ary_push(insn, read_memnum(&cursor));
                break;
            }
            case OP_BACKREF_MULTI: {
                rb_ary_push(insn, ID2SYM(rb_intern("backref_multi")));
                VALUE length = read_length(&cursor);
                rb_ary_push(insn, length);
                for (int i = 0; i < NUM2INT(length); i++) {
                    rb_ary_push(insn, read_memnum(&cursor));
                }
                break;
            }
            case OP_BACKREF_MULTI_IC: {
                rb_ary_push(insn, ID2SYM(rb_intern("backref_multi_ic")));
                VALUE length = read_length(&cursor);
                rb_ary_push(insn, length);
                for (int i = 0; i < NUM2INT(length); i++) {
                    rb_ary_push(insn, read_memnum(&cursor));
                }
                break;
            }
            case OP_BACKREF_WITH_LEVEL: {
                rb_ary_push(insn, ID2SYM(rb_intern("backref_with_level")));
                rb_ary_push(insn, read_option(&cursor));
                rb_ary_push(insn, read_length(&cursor));
                VALUE length = read_length(&cursor);
                rb_ary_push(insn, length);
                for (int i = 0; i < NUM2INT(length); i++) {
                    rb_ary_push(insn, read_memnum(&cursor));
                }
                break;
            }
            case OP_MEMORY_START: {
                rb_ary_push(insn, ID2SYM(rb_intern("memory_start")));
                rb_ary_push(insn, read_memnum(&cursor));
                break;
            }
            case OP_MEMORY_START_PUSH: {
                rb_ary_push(insn, ID2SYM(rb_intern("memory_start_push")));
                rb_ary_push(insn, read_memnum(&cursor));
                break;
            }
            case OP_MEMORY_END_PUSH: {
                rb_ary_push(insn, ID2SYM(rb_intern("memory_end_push")));
                rb_ary_push(insn, read_memnum(&cursor));
                break;
            }
            case OP_MEMORY_END_PUSH_REC: {
                rb_ary_push(insn, ID2SYM(rb_intern("memory_end_push_rec")));
                rb_ary_push(insn, read_memnum(&cursor));
                break;
            }
            case OP_MEMORY_END: {
                rb_ary_push(insn, ID2SYM(rb_intern("memory_end")));
                rb_ary_push(insn, read_memnum(&cursor));
                break;
            }
            case OP_MEMORY_END_REC: {
                rb_ary_push(insn, ID2SYM(rb_intern("memory_end_rec")));
                rb_ary_push(insn, read_memnum(&cursor));
                break;
            }
            case OP_KEEP: {
                rb_ary_push(insn, ID2SYM(rb_intern("keep")));
                break;
            }
            case OP_FAIL: {
                rb_ary_push(insn, ID2SYM(rb_intern("fail")));
                break;
            }
            case OP_JUMP: {
                rb_ary_push(insn, ID2SYM(rb_intern("jump")));
                rb_ary_push(insn, read_reladdr(&cursor));
                break;
            }
            case OP_PUSH: {
                rb_ary_push(insn, ID2SYM(rb_intern("push")));
                rb_ary_push(insn, read_reladdr(&cursor));
                break;
            }
            case OP_POP: {
                rb_ary_push(insn, ID2SYM(rb_intern("pop")));
                break;
            }
            case OP_PUSH_OR_JUMP_EXACT1: {
                rb_ary_push(insn, ID2SYM(rb_intern("push_or_jump_exact1")));
                rb_ary_push(insn, read_reladdr(&cursor));
                rb_ary_push(insn, read_exact(&cursor, 1, encoding));
                break;
            }
            case OP_PUSH_IF_PEEK_NEXT: {
                rb_ary_push(insn, ID2SYM(rb_intern("push_if_peek_next")));
                rb_ary_push(insn, read_reladdr(&cursor));
                rb_ary_push(insn, read_exact(&cursor, 1, encoding));
                break;
            }
            case OP_REPEAT: {
                rb_ary_push(insn, ID2SYM(rb_intern("repeat")));
                rb_ary_push(insn, read_memnum(&cursor));
                rb_ary_push(insn, read_reladdr(&cursor));
                break;
            }
            case OP_REPEAT_NG: {
                rb_ary_push(insn, ID2SYM(rb_intern("repeat_ng")));
                rb_ary_push(insn, read_memnum(&cursor));
                rb_ary_push(insn, read_reladdr(&cursor));
                break;
            }
            case OP_REPEAT_INC: {
                rb_ary_push(insn, ID2SYM(rb_intern("repeat_inc")));
                rb_ary_push(insn, read_memnum(&cursor));
                break;
            }
            case OP_REPEAT_INC_NG: {
                rb_ary_push(insn, ID2SYM(rb_intern("repeat_inc_ng")));
                rb_ary_push(insn, read_memnum(&cursor));
                break;
            }
            case OP_REPEAT_INC_SG: {
                rb_ary_push(insn, ID2SYM(rb_intern("repeat_inc_sg")));
                rb_ary_push(insn, read_memnum(&cursor));
                break;
            }
            case OP_REPEAT_INC_NG_SG: {
                rb_ary_push(insn, ID2SYM(rb_intern("repeat_inc_ng_sg")));
                rb_ary_push(insn, read_memnum(&cursor));
                break;
            }
            case OP_NULL_CHECK_START: {
                rb_ary_push(insn, ID2SYM(rb_intern("null_check_start")));
                rb_ary_push(insn, read_memnum(&cursor));
                break;
            }
            case OP_NULL_CHECK_END: {
                rb_ary_push(insn, ID2SYM(rb_intern("null_check_end")));
                rb_ary_push(insn, read_memnum(&cursor));
                break;
            }
            case OP_NULL_CHECK_END_MEMST: {
                rb_ary_push(insn, ID2SYM(rb_intern("null_check_end_memst")));
                rb_ary_push(insn, read_memnum(&cursor));
                break;
            }
            case OP_NULL_CHECK_END_MEMST_PUSH: {
                rb_ary_push(insn, ID2SYM(rb_intern("null_check_end_memst_push")));
                rb_ary_push(insn, read_memnum(&cursor));
                break;
            }
            case OP_PUSH_POS: {
                rb_ary_push(insn, ID2SYM(rb_intern("push_pos")));
                break;
            }
            case OP_POP_POS: {
                rb_ary_push(insn, ID2SYM(rb_intern("pop_pos")));
                break;
            }
            case OP_PUSH_POS_NOT: {
                rb_ary_push(insn, ID2SYM(rb_intern("push_pos_not")));
                rb_ary_push(insn, read_reladdr(&cursor));
                break;
            }
            case OP_FAIL_POS: {
                rb_ary_push(insn, ID2SYM(rb_intern("fail_pos")));
                break;
            }
            case OP_PUSH_STOP_BT: {
                rb_ary_push(insn, ID2SYM(rb_intern("push_stop_bt")));
                break;
            }
            case OP_POP_STOP_BT: {
                rb_ary_push(insn, ID2SYM(rb_intern("pop_stop_bt")));
                break;
            }
            case OP_LOOK_BEHIND: {
                rb_ary_push(insn, ID2SYM(rb_intern("look_behind")));
                rb_ary_push(insn, read_length(&cursor));
                break;
            }
            case OP_PUSH_LOOK_BEHIND_NOT: {
                rb_ary_push(insn, ID2SYM(rb_intern("push_look_behind_not")));
                rb_ary_push(insn, read_reladdr(&cursor));
                rb_ary_push(insn, read_length(&cursor));
                break;
            }
            case OP_FAIL_LOOK_BEHIND_NOT: {
                rb_ary_push(insn, ID2SYM(rb_intern("fail_look_behind_not")));
                break;
            }
            case OP_PUSH_ABSENT_POS: {
                rb_ary_push(insn, ID2SYM(rb_intern("push_absent_pos")));
                break;
            }
            case OP_ABSENT: {
                rb_ary_push(insn, ID2SYM(rb_intern("absent")));
                rb_ary_push(insn, read_reladdr(&cursor));
                break;
            }
            case OP_ABSENT_END: {
                rb_ary_push(insn, ID2SYM(rb_intern("absent_end")));
                break;
            }
            case OP_CALL: {
                rb_ary_push(insn, ID2SYM(rb_intern("call")));
                rb_ary_push(insn, read_absaddr(&cursor));
                break;
            }
            case OP_RETURN: {
                rb_ary_push(insn, ID2SYM(rb_intern("return")));
                break;
            }
            case OP_CONDITION: {
                rb_ary_push(insn, ID2SYM(rb_intern("condition")));
                rb_ary_push(insn, read_memnum(&cursor));
                rb_ary_push(insn, read_reladdr(&cursor));
                break;
            }
            case OP_STATE_CHECK_PUSH: {
                rb_ary_push(insn, ID2SYM(rb_intern("state_check_push")));
                rb_ary_push(insn, read_state_check(&cursor));
                rb_ary_push(insn, read_reladdr(&cursor));
                break;
            }
            case OP_STATE_CHECK_PUSH_OR_JUMP: {
                rb_ary_push(insn, ID2SYM(rb_intern("state_check_push_or_jump")));
                rb_ary_push(insn, read_state_check(&cursor));
                rb_ary_push(insn, read_reladdr(&cursor));
                break;
            }
            case OP_STATE_CHECK: {
                rb_ary_push(insn, ID2SYM(rb_intern("state_check")));
                rb_ary_push(insn, read_state_check(&cursor));
                break;
            }
            case OP_STATE_CHECK_ANYCHAR_STAR: {
                rb_ary_push(insn, ID2SYM(rb_intern("state_check_anychar_star")));
                rb_ary_push(insn, read_state_check(&cursor));
                break;
            }
            case OP_STATE_CHECK_ANYCHAR_ML_STAR: {
                rb_ary_push(insn, ID2SYM(rb_intern("state_check_anychar_ml_star")));
                rb_ary_push(insn, read_state_check(&cursor));
                break;
            }
            case OP_SET_OPTION_PUSH: {
                rb_ary_push(insn, ID2SYM(rb_intern("set_option_push")));
                rb_ary_push(insn, read_option(&cursor));
                break;
            }
            case OP_SET_OPTION: {
                rb_ary_push(insn, ID2SYM(rb_intern("set_option")));
                rb_ary_push(insn, read_option(&cursor));
                break;
            }
            default: {
                // Unrecognised opcode byte: emit an empty entry and keep going.
                break;
            }
        }

        rb_ary_push(insns, insn);
    }

    onig_free(regex);
    return insns;
}
/*
void
Init_onigmo(void) {
VALUE rb_cOnigmo = rb_define_module("Onigmo");
rb_define_singleton_method(rb_cOnigmo, "parse", parse, 1);
rb_define_singleton_method(rb_cOnigmo, "compile", compile, 1);
rb_cOnigmoNode = rb_define_class(rb_cOnigmo, "Node", rb_cObject);
rb_cOnigmoAlternationNode = rb_define_class(rb_cOnigmo, "AlternationNode");
rb_cOnigmoAnchorBufferBeginNode = rb_define_class(rb_cOnigmo, "AnchorBufferBeginNode");
rb_cOnigmoAnchorBufferEndNode = rb_define_class(rb_cOnigmo, "AnchorBufferEndNode");
rb_cOnigmoAnchorKeepNode = rb_define_class(rb_cOnigmo, "AnchorKeepNode");
rb_cOnigmoAnchorLineBeginNode = rb_define_class(rb_cOnigmo, "AnchorLineBeginNode");
rb_cOnigmoAnchorLineEndNode = rb_define_class(rb_cOnigmo, "AnchorLineEndNode");
rb_cOnigmoAnchorPositionBeginNode = rb_define_class(rb_cOnigmo, "AnchorPositionBeginNode");
rb_cOnigmoAnchorSemiEndNode = rb_define_class(rb_cOnigmo, "AnchorSemiEndNode");
rb_cOnigmoAnchorWordBoundaryNode = rb_define_class(rb_cOnigmo, "AnchorWordBoundaryNode");
rb_cOnigmoAnchorWordBoundaryInvertNode = rb_define_class(rb_cOnigmo, "AnchorWordBoundaryInvertNode");
rb_cOnigmoAnyNode = rb_define_class(rb_cOnigmo, "AnyNode");
rb_cOnigmoBackrefNode = rb_define_class(rb_cOnigmo, "BackrefNode");
rb_cOnigmoCallNode = rb_define_class(rb_cOnigmo, "CallNode");
rb_cOnigmoCClassNode = rb_define_class(rb_cOnigmo, "CClassNode");
rb_cOnigmoCClassInvertNode = rb_define_class(rb_cOnigmo, "CClassInvertNode");
rb_cOnigmoEncloseAbsentNode = rb_define_class(rb_cOnigmo, "EncloseAbsentNode");
rb_cOnigmoEncloseConditionNode = rb_define_class(rb_cOnigmo, "EncloseConditionNode");
rb_cOnigmoEncloseMemoryNode = rb_define_class(rb_cOnigmo, "EncloseMemoryNode");
rb_cOnigmoEncloseOptionsNode = rb_define_class(rb_cOnigmo, "EncloseOptionsNode");
rb_cOnigmoEncloseStopBacktrackNode = rb_define_class(rb_cOnigmo, "EncloseStopBacktrackNode");
rb_cOnigmoListNode = rb_define_class(rb_cOnigmo, "ListNode");
rb_cOnigmoLookAheadNode = rb_define_class(rb_cOnigmo, "LookAheadNode");
rb_cOnigmoLookAheadInvertNode = rb_define_class(rb_cOnigmo, "LookAheadInvertNode");
rb_cOnigmoLookBehindNode = rb_define_class(rb_cOnigmo, "LookBehindNode");
rb_cOnigmoLookBehindInvertNode = rb_define_class(rb_cOnigmo, "LookBehindInvertNode");
rb_cOnigmoQuantifierNode = rb_define_class(rb_cOnigmo, "QuantifierNode");
rb_cOnigmoStringNode = rb_define_class(rb_cOnigmo, "StringNode");
rb_cOnigmoWordNode = rb_define_class(rb_cOnigmo, "WordNode");
rb_cOnigmoWordInvertNode = rb_define_class(rb_cOnigmo, "WordInvertNode");
}
*/
/*
// Main fuzzer
#define FUZZ_LOOP_COUNT 100000
__AFL_FUZZ_INIT();
static VALUE mHtmlTokenizer = Qnil;
int main(int argc, char** argv) {
ruby_init();
__AFL_INIT();
unsigned char *buf = __AFL_FUZZ_TESTCASE_BUF;
while (__AFL_LOOP(FUZZ_LOOP_COUNT)) {
int len = __AFL_FUZZ_TESTCASE_LEN;
VALUE cFoo = rb_define_class("Foo", rb_cObject);
rb_define_alloc_func(cFoo, parser_allocate);
rb_define_method(cFoo, "initialize", parser_initialize_method, 0); // One argument
// Now define the other method:
// rb_define_method(cParser, "parse", parser_parse_method, 1);
rb_define_method(cFoo, "parse", parser_parse_method, 1);
VALUE x;
//x = rb_str_new_cstr("<div>"); // Example html string.
x = rb_str_new_cstr(buf); // Create string from fuzz buffer
VALUE obj = rb_class_new_instance(0, NULL, cFoo);
rb_funcall(obj, rb_intern("initialize"), 0);
// Now try to parse.
//printf("Now trying to call parse!!!!\n");
//printf("Here is the buffer %s\n", buf);
rb_funcall(obj, rb_intern("parse"), 1, x);
//printf("Done!\n");
}
return ruby_cleanup(0);
}
*/
// Iteration count handed to __AFL_LOOP in main: how many test cases a single
// process handles in AFL persistent mode.
#define FUZZ_LOOP_COUNT 100000
// AFL++ file-scope macro; presumably supplies the state behind
// __AFL_FUZZ_TESTCASE_BUF / __AFL_FUZZ_TESTCASE_LEN used in main (see AFL++ docs).
__AFL_FUZZ_INIT();
int main(int argc, char** argv) {
// Initialize ruby
ruby_init();
VALUE x;
// Define example regex string
// Initialize the Onigmo module
//printf("Initializing the Onigmo module...\n");
// VALUE rb_cOnigmo = rb_define_module("Onigmo");
VALUE rb_cOnigmo = rb_define_class("Foo", rb_cObject);
// rb_define_singleton_method(rb_cOnigmo, "parse", parse, 1);
// rb_define_singleton_method(rb_cOnigmo, "compile", compile, 1);
rb_define_method(rb_cOnigmo, "parse", parse, 1);
rb_define_method(rb_cOnigmo, "compile", compile, 1);
/*
rb_cOnigmoNode = rb_define_class(rb_cOnigmo, "Node", rb_cObject);
rb_cOnigmoAlternationNode = rb_define_class(rb_cOnigmo, "AlternationNode");
rb_cOnigmoAnchorBufferBeginNode = rb_define_class(rb_cOnigmo, "AnchorBufferBeginNode");
rb_cOnigmoAnchorBufferEndNode = rb_define_class(rb_cOnigmo, "AnchorBufferEndNode");
rb_cOnigmoAnchorKeepNode = rb_define_class(rb_cOnigmo, "AnchorKeepNode");
rb_cOnigmoAnchorLineBeginNode = rb_define_class(rb_cOnigmo, "AnchorLineBeginNode");
rb_cOnigmoAnchorLineEndNode = rb_define_class(rb_cOnigmo, "AnchorLineEndNode");
rb_cOnigmoAnchorPositionBeginNode = rb_define_class(rb_cOnigmo, "AnchorPositionBeginNode");
rb_cOnigmoAnchorSemiEndNode = rb_define_class(rb_cOnigmo, "AnchorSemiEndNode");
rb_cOnigmoAnchorWordBoundaryNode = rb_define_class(rb_cOnigmo, "AnchorWordBoundaryNode");
rb_cOnigmoAnchorWordBoundaryInvertNode = rb_define_class(rb_cOnigmo, "AnchorWordBoundaryInvertNode");
rb_cOnigmoAnyNode = rb_define_class(rb_cOnigmo, "AnyNode");
rb_cOnigmoBackrefNode = rb_define_class(rb_cOnigmo, "BackrefNode");
rb_cOnigmoCallNode = rb_define_class(rb_cOnigmo, "CallNode");
rb_cOnigmoCClassNode = rb_define_class(rb_cOnigmo, "CClassNode");
rb_cOnigmoCClassInvertNode = rb_define_class(rb_cOnigmo, "CClassInvertNode");
rb_cOnigmoEncloseAbsentNode = rb_define_class(rb_cOnigmo, "EncloseAbsentNode");
rb_cOnigmoEncloseConditionNode = rb_define_class(rb_cOnigmo, "EncloseConditionNode");
rb_cOnigmoEncloseMemoryNode = rb_define_class(rb_cOnigmo, "EncloseMemoryNode");
rb_cOnigmoEncloseOptionsNode = rb_define_class(rb_cOnigmo, "EncloseOptionsNode");
rb_cOnigmoEncloseStopBacktrackNode = rb_define_class(rb_cOnigmo, "EncloseStopBacktrackNode");
rb_cOnigmoListNode = rb_define_class(rb_cOnigmo, "ListNode");
rb_cOnigmoLookAheadNode = rb_define_class(rb_cOnigmo, "LookAheadNode");
rb_cOnigmoLookAheadInvertNode = rb_define_class(rb_cOnigmo, "LookAheadInvertNode");
rb_cOnigmoLookBehindNode = rb_define_class(rb_cOnigmo, "LookBehindNode");
rb_cOnigmoLookBehindInvertNode = rb_define_class(rb_cOnigmo, "LookBehindInvertNode");
rb_cOnigmoQuantifierNode = rb_define_class(rb_cOnigmo, "QuantifierNode");
rb_cOnigmoStringNode = rb_define_class(rb_cOnigmo, "StringNode");
rb_cOnigmoWordNode = rb_define_class(rb_cOnigmo, "WordNode");
rb_cOnigmoWordInvertNode = rb_define_class(rb_cOnigmo, "WordInvertNode");
*/
rb_cOnigmoNode = rb_define_class("Node", rb_cObject);
rb_cOnigmoAlternationNode = rb_define_class("AlternationNode", rb_cObject);
rb_cOnigmoAnchorBufferBeginNode = rb_define_class("AnchorBufferBeginNode", rb_cObject);
rb_cOnigmoAnchorBufferEndNode = rb_define_class("AnchorBufferEndNode", rb_cObject);
rb_cOnigmoAnchorKeepNode = rb_define_class("AnchorKeepNode", rb_cObject);
rb_cOnigmoAnchorLineBeginNode = rb_define_class("AnchorLineBeginNode", rb_cObject);
rb_cOnigmoAnchorLineEndNode = rb_define_class("AnchorLineEndNode", rb_cObject);
rb_cOnigmoAnchorPositionBeginNode = rb_define_class("AnchorPositionBeginNode", rb_cObject);
rb_cOnigmoAnchorSemiEndNode = rb_define_class("AnchorSemiEndNode", rb_cObject);
rb_cOnigmoAnchorWordBoundaryNode = rb_define_class("AnchorWordBoundaryNode", rb_cObject);
rb_cOnigmoAnchorWordBoundaryInvertNode = rb_define_class("AnchorWordBoundaryInvertNode", rb_cObject);
rb_cOnigmoAnyNode = rb_define_class("AnyNode", rb_cObject);
rb_cOnigmoBackrefNode = rb_define_class("BackrefNode", rb_cObject);
rb_cOnigmoCallNode = rb_define_class("CallNode", rb_cObject);
rb_cOnigmoCClassNode = rb_define_class("CClassNode", rb_cObject);
rb_cOnigmoCClassInvertNode = rb_define_class("CClassInvertNode", rb_cObject);
rb_cOnigmoEncloseAbsentNode = rb_define_class("EncloseAbsentNode", rb_cObject);
rb_cOnigmoEncloseConditionNode = rb_define_class("EncloseConditionNode", rb_cObject);
rb_cOnigmoEncloseMemoryNode = rb_define_class("EncloseMemoryNode", rb_cObject);
rb_cOnigmoEncloseOptionsNode = rb_define_class("EncloseOptionsNode", rb_cObject);
rb_cOnigmoEncloseStopBacktrackNode = rb_define_class("EncloseStopBacktrackNode", rb_cObject);
rb_cOnigmoListNode = rb_define_class("ListNode", rb_cObject);
rb_cOnigmoLookAheadNode = rb_define_class("LookAheadNode", rb_cObject);
rb_cOnigmoLookAheadInvertNode = rb_define_class("LookAheadInvertNode", rb_cObject);
rb_cOnigmoLookBehindNode = rb_define_class("LookBehindNode", rb_cObject);
rb_cOnigmoLookBehindInvertNode = rb_define_class("LookBehindInvertNode", rb_cObject);
rb_cOnigmoQuantifierNode = rb_define_class("QuantifierNode", rb_cObject);
rb_cOnigmoStringNode = rb_define_class("StringNode", rb_cObject);
rb_cOnigmoWordNode = rb_define_class("WordNode", rb_cObject);
rb_cOnigmoWordInvertNode = rb_define_class("WordInvertNode", rb_cObject);
// rb_cOnigmo is the module thing.
//unsigned char paska[10000];
//read(0, paska, 10000-1);
//printf("Here is the buffer contents: %s\n", paska);
// Main fuzzing loop.
VALUE module_object = rb_class_new_instance(0, NULL, rb_cOnigmo);
#define BUF_SIZE 10000
unsigned char paska[BUF_SIZE];
__AFL_INIT();
unsigned char *buf = __AFL_FUZZ_TESTCASE_BUF;
//while (true) {
while (__AFL_LOOP(FUZZ_LOOP_COUNT)) {
// Zero out the buffer
memset(paska, 0, BUF_SIZE);
// Now read into buffer.
int len = __AFL_FUZZ_TESTCASE_LEN;
if (len >= BUF_SIZE) {
//read(0, paska, BUF_SIZE-1); // Must be null terminated
memcpy(paska, buf, BUF_SIZE-1);
} else {
memcpy(paska, buf, len);
//read(0, paska, len);
}
//x = rb_str_new_cstr(buf); // Convert to ruby string object.
x = rb_str_new_cstr(paska);
//printf("Creating instance of rb_cOnigmo...\n");
// VALUE module_object = rb_class_new_instance(0, NULL, rb_cOnigmo);
//printf("Fuck!!!\n");
///printf("Trying to call \"parse\" on the module...\n");
// rb_funcall(module_object, rb_intern("parse"), 1, x);
//printf("Trying to compile...\n");
// compile
rb_funcall(module_object, rb_intern("parse"), 1, x);
// Try to compile the regex
rb_funcall(module_object, rb_intern("compile"), 1, x);
//printf("Done!\n");
}
// Cleanup
return ruby_cleanup(0);
}
and it seems to fuzz just fine.