I am trying to get an array of all regex matches. For example, something like this:
PATTERN: (\d+)[a-z]+
STRING: 123asd
RESULT: ["123asd", "123"]
^ ^
full capture group 1
Additionally, if there are multiple matches, it should continue matching, for example:
123asd 123asd
[ ["123asd", "123"], ["123asd", "123"] ]
^ ^
match 1 match 2
Here is what I came up with, where I try and create functions to do each of the items (though I haven't yet added a fetch_all()
function):
// pcretest.c
#include <stdio.h>
#include <string.h>
#include <errno.h>
#define PCRE2_CODE_UNIT_WIDTH 8
#include <pcre2.h>
pcre2_code* pcre_compile_pattern(PCRE2_SPTR pattern, uint32_t options)
{
PCRE2_SIZE error_offset;
int error_number;
pcre2_code *re_compiled = pcre2_compile(pattern, PCRE2_ZERO_TERMINATED, options, &error_number, &error_offset, NULL);
if (re_compiled == NULL) {
PCRE2_UCHAR buffer[256];
pcre2_get_error_message(error_number, buffer, sizeof(buffer));
printf("Error: Compiling of pattern '%s' failed at offset %d: %s\n", pattern, (int)error_offset, buffer);
}
return re_compiled;
}
struct match_obj {
int size;
char** matches;
};
// will return the offset of the full-match (or an error-code), and populate the struct match_obj
int get_next_match(pcre2_code *re_compiled, PCRE2_SPTR8 string, struct match_obj *matches, int max_matches)
{
#define MAX_MATCHES_EXCEEDED (-99)
pcre2_match_data *match_data = pcre2_match_data_create_from_pattern(re_compiled, NULL);
int return_code = pcre2_match(re_compiled, string, strlen((char*)string), 0, 0, match_data, NULL);
// Error codes: https://code.woboq.org/qt5/qtbase/src/3rdparty/pcre2/src/pcre2.h.html#313
if (return_code < 0) {
PCRE2_UCHAR buffer[256];
pcre2_get_error_message(return_code, buffer, sizeof(buffer));
if (return_code != PCRE2_ERROR_NOMATCH)
printf("Error trying to match against '%s': %s\n", string, buffer);
return return_code;
}
// Make sure no buffer overflow
if (return_code > max_matches) {
printf("Input buffer is too small.\n");
return MAX_MATCHES_EXCEEDED;
}
PCRE2_SIZE *offset_vector = pcre2_get_ovector_pointer(match_data);
matches->size = return_code;
for (int i=0; i < return_code; i++)
{
PCRE2_SPTR substring_start = string + offset_vector[2*i];
size_t substring_length = offset_vector[2*i+1] - offset_vector[2*i];
char* string = malloc(sizeof *string * substring_length);
strncpy(string, (const char*) substring_start, substring_length);
matches->matches[i] = string;
}
// (start, end) of the full match is the zero'th entry
int end_position = offset_vector[1];
return end_position;
}
int main(void)
{
PCRE2_SPTR8 pattern = (const unsigned char*) "he[al](lo)";
PCRE2_SPTR8 string = (const unsigned char*) "add ello a healo b hello c";
// 1. Compile the pattern
/* uint32_t re_options=0; */
pcre2_code *re_compiled = pcre_compile_pattern(pattern, 0);
// 2. grab matches until expired
#define MAX_MATCH_COMPONENTS 10
#define MAX_TOTAL_MATCHES 10
struct match_obj all_matches[MAX_TOTAL_MATCHES];
int advance_by, total_matches=0;
for (; total_matches < MAX_TOTAL_MATCHES; total_matches++) {
struct match_obj *match_ptr = &all_matches[total_matches];
match_ptr->matches = malloc(sizeof (char*) * MAX_MATCH_COMPONENTS);
advance_by = get_next_match(re_compiled, string, match_ptr, MAX_MATCH_COMPONENTS);
if (advance_by < 0) break;
string += advance_by;
}
// 3. Display them (or do whatever we want with them)
for (int match_num=0; match_num < total_matches; match_num++) {
struct match_obj match = all_matches[match_num];
for (int i=0; i<match.size; i++) printf("Match %d.%d: %s\n", match_num, i, match.matches[i]);
}
// 4. Free the allocations - array of string-pointers here, created-strings in get_next_match()
for (int match_num=0; match_num < total_matches; match_num++) {
for (int i=0; i < all_matches[match_num].size; i++)
free(all_matches[match_num].matches[i]); // free the string
free(all_matches[match_num].matches); // and the array of string pointers
}
}
If helpful, here is the code on OnlineGDB. Note, however, I wasn't able to compile with extra compiler flags to work with #include <pcre2.h>
.
Here are a few specific questions about this:
Figuring out allocations are hard! Does the above look like a sensible approach? I couldn't figure out which function should do what. My first thought was to do everything 'on the stack' in
main
but then the string-pointer array started acting up and so I moved tomalloc
's in main. Is there something like a good rule of thumb for where to do mallocs or how to split them up?Does the data structure look up for returning the matches? I thought an array-of-char* 's would work, though I ended up creating a struct as there were some other things I needed to keep track of (how far it advances, what the matches are, how many matches there are -- though this last item is I think always the same if it's using the same pattern).
-
1\$\begingroup\$ For reviewers: related \$\endgroup\$Mast– Mast ♦2021年04月10日 07:47:27 +00:00Commented Apr 10, 2021 at 7:47
1 Answer 1
Looks generally good, and compiles almost cleanly.
The "almost" is because we return int
from get_next_match
, but try to shoehorn negative error numbers and positive size_t
into it. I'd at least change to return long long
; it's probably better to return the length separately from the status.
We fail to free the pcre2_code
and pcre2_match_data
objects that we get from the PCRE library.
We fail to check the return value where we use malloc()
. Not doing so risks Undefined Behaviour when we dereference it. Easily fixed:
char* string = malloc(substring_length); /* sizeof (char) is necessarily 1 */
if (!string) {
return PCRE2_ERROR_NOMEMORY;
}
match_ptr->matches = malloc(sizeof *match_ptr->matches * MAX_MATCH_COMPONENTS);
if (!match_ptr->matches) {
fprintf(stderr, strerror(ENOMEM));
return EXIT_FAILURE;
}
(We'll need to include <stdlib.h>
to get a definition of EXIT_FAILURE
)
Really, we should clean up properly even if we fail. This will make it easier to re-use this code in a program that can't just exit in this case. As you say, memory management is hard in C!
It's easiest if we define functions to initialise and release our structures:
The other problem we have with memory handling is here:
strncpy(string, (const char*) substring_start, substring_length);
Do you see what's missing there? If not, then run Valgrind again, and look what happens when we print it. We've copied the strings characters, but haven't provided a terminating 0円
, leading to more UB.
char* string = malloc(substring_length + 1);
if (!string) {
return PCRE2_ERROR_NOMEMORY;
}
memcpy(string, substring_start, substring_length);
string[substring_length + 1] = '0円';
I don't like this macro definition buried halfway through a function:
#define MAX_MATCHES_EXCEEDED (-99)
Instead of a macro, define an integer at global scope:
static const int MAX_MATCHES_EXCEEDED = -99;
It's probably better to just allocate the size of results we need, in get_next_match()
where the number is known, instead of having a fixed limit here.
The URL in comment is dead (404).
It's confusing to have a local variable string
that shadows the function argument string
in get_next_match
.
Modified code
This compiles without warnings using gcc -Wall -Wextra -Wwrite-strings -Wno-parentheses -Wpedantic -Warray-bounds -Wconversion -Wstrict-prototypes
, and runs clean under Valgrind.
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define PCRE2_CODE_UNIT_WIDTH 8
#include <pcre2.h>
struct match_obj {
int count;
char** matches;
};
static void match_obj_init(struct match_obj *p)
{
p->count = 0;
p->matches = NULL;
}
static void match_obj_free(struct match_obj *p)
{
if (p) {
for (int i = 0; i < p->count; ++i) {
free(p->matches[i]);
}
free(p->matches);
}
}
pcre2_code* pcre_compile_pattern(PCRE2_SPTR pattern, uint32_t options)
{
PCRE2_SIZE error_offset;
int error_number;
pcre2_code *re_compiled = pcre2_compile(pattern, PCRE2_ZERO_TERMINATED, options, &error_number, &error_offset, NULL);
if (re_compiled == NULL) {
PCRE2_UCHAR buffer[256];
pcre2_get_error_message(error_number, buffer, sizeof buffer);
fprintf(stderr, "Error: Compiling of pattern '%s' failed at offset %zd: %s\n", pattern, error_offset, buffer);
}
return re_compiled;
}
// will return the offset of the full-match (or an error-code), and populate the struct match_obj
int get_next_match(pcre2_code *re_compiled, PCRE2_SPTR8 string,
struct match_obj *matches, size_t *advance)
{
pcre2_match_data *match_data = pcre2_match_data_create_from_pattern(re_compiled, NULL);
int return_code = pcre2_match(re_compiled, string, strlen((char*)string), 0, 0, match_data, NULL);
// Error codes: https://codebrowser.dev/qt5/include/pcre2.h.html#313
if (return_code == PCRE2_ERROR_NOMATCH) {
/* normal */
free(match_data);
return 0;
}
if (return_code < 0) {
free(match_data);
return return_code;
}
PCRE2_SIZE *offset_vector = pcre2_get_ovector_pointer(match_data);
matches->matches = malloc(sizeof *matches->matches * (size_t)return_code);
matches->count = return_code;
if (!matches->matches) {
free(match_data);
return PCRE2_ERROR_NOMEMORY;
}
for (int i = 0; i < matches->count; ++i) {
matches->matches[i] = NULL;
}
for (int i = 0; i < return_code; ++i)
{
PCRE2_SPTR substring_start = string + offset_vector[2*i];
size_t substring_length = offset_vector[2*i+1] - offset_vector[2*i];
char *const s = malloc(substring_length + 1);
if (!s) {
free(match_data);
return PCRE2_ERROR_NOMEMORY;
}
memcpy(s, substring_start, substring_length);
s[substring_length] = '0円';
matches->matches[i] = s;
}
// (start, end) of the full match is the zero'th entry
*advance = offset_vector[1];
pcre2_match_data_free(match_data);
return 0;
}
#define MAX_TOTAL_MATCHES 10
int main(void)
{
PCRE2_SPTR8 pattern = (const unsigned char*) "he[al](lo)";
PCRE2_SPTR8 string = (const unsigned char*) "add ello a healo b hello c";
// 1. Compile the pattern
/* uint32_t re_options=0; */
pcre2_code *re_compiled = pcre_compile_pattern(pattern, 0);
if (!re_compiled) {
return EXIT_FAILURE;
}
// 2. grab matches until expired
struct match_obj all_matches[MAX_TOTAL_MATCHES];
for (int i = 0; i < MAX_TOTAL_MATCHES; ++i) {
match_obj_init(all_matches + i);
}
int total_matches=0;
for (; total_matches < MAX_TOTAL_MATCHES; ++total_matches) {
struct match_obj *match_ptr = &all_matches[total_matches];
size_t advance_by;
int errcode = get_next_match(re_compiled, string, match_ptr, &advance_by);
if (errcode) {
PCRE2_UCHAR buffer[256];
pcre2_get_error_message(errcode, buffer, sizeof buffer);
fprintf(stderr, "Error: Search failed: %s\n", buffer);
return EXIT_FAILURE;
}
string += advance_by;
}
pcre2_code_free(re_compiled);
// 3. Display them (or do whatever we want with them)
for (int match_num = 0; match_num < total_matches; ++match_num) {
struct match_obj match = all_matches[match_num];
for (int i = 0; i < match.count; ++i) {
printf("Match %d.%d: %s\n", match_num, i, match.matches[i]);
}
}
// 4. Free the allocations - array of string-pointers here, created-strings in get_next_match()
for (int match_num = 0; match_num < MAX_TOTAL_MATCHES; ++match_num) {
match_obj_free(all_matches + match_num);
}
}