split
split(p1 = v1, p2 = v2)
Divides str into substrings based on a delimiter, returning an array of these substrings.
If pattern is a String, then its contents are used as the delimiter when splitting str. If pattern is a single space, str is split on whitespace, with leading whitespace and runs of contiguous whitespace characters ignored.
If pattern is a Regexp, str is divided where the pattern matches. Whenever the pattern matches a zero-length string, str is split into individual characters. If pattern contains groups, the respective matches will be returned in the array as well.
If pattern is nil, the value of $; is used. If $; is nil (which is the default), str is split on whitespace as if ‘ ’ were specified.
If the limit parameter is omitted, trailing null fields are suppressed. If limit is a positive number, at most that number of split substrings will be returned (captured groups will be returned as well, but are not counted towards the limit). If limit is 1, the entire string is returned as the only entry in an array. If negative, there is no limit to the number of fields returned, and trailing null fields are not suppressed.
When the input str is empty an empty Array is returned as the string is considered to have no fields to split.
" now's the time".split #=> ["now's", "the", "time"] " now's the time".split ('') #=> ["now's", "the", "time"] " now's the time".split (//) #=> ["", "now's", "", "the", "time"] "1, 2.34,56, 7".split (%r{,\s*}) #=> ["1", "2.34", "56", "7"] "hello".split (//) #=> ["h", "e", "l", "l", "o"] "hello".split (//, 3) #=> ["h", "e", "llo"] "hi mom".split (%r{\s*}) #=> ["h", "i", "m", "o", "m"] "mellow yellow".split ("ello") #=> ["m", "w y", "w"] "1,2,,3,4,,".split (',') #=> ["1", "2", "", "3", "4"] "1,2,,3,4,,".split (',', 4) #=> ["1", "2", "", "3,4,,"] "1,2,,3,4,,".split (',', -4) #=> ["1", "2", "", "3", "4", "", ""] "1:2:3".split (/(:)()()/, 2) #=> ["1", ":", "", "", "2:3"] "".split (',', -1) #=> []
/*
 * Implementation of String#split: cuts str into substrings and returns them
 * in a newly created array.
 *
 * argc/argv - zero to two optional arguments (pattern, limit), unpacked with
 *             rb_scan_args using the "02" format.
 * str       - the receiver string being split.
 *
 * Returns an Array of substrings of str (plus capture-group matches when
 * splitting by a Regexp with groups).
 *
 * Raises TypeError when no pattern is given and rb_fs (the value the error
 * message calls `$;`) is set to something rb_fs_check rejects.
 *
 * One of three strategies is chosen from the pattern:
 *   awk    - split on runs of whitespace, ignoring leading whitespace;
 *   string - split on a literal byte sequence found with rb_memsearch;
 *   regexp - split wherever rb_reg_search finds a match.
 */
static VALUE
rb_str_split_m(int argc, VALUE *argv, VALUE str)
{
rb_encoding *enc;
VALUE spat;
VALUE limit;
enum {awk, string, regexp} split_type;
/* beg: byte offset in str where the not-yet-pushed piece starts.
 * end: byte offset of the end of the current piece / of the latest match.
 * i:   number of pieces accounted for so far, compared against lim. */
long beg, end, i = 0;
int lim = 0;
VALUE result, tmp;
if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
lim = NUM2INT(limit);
/* A non-positive limit means "unlimited": limit becomes nil, but lim keeps
 * its sign so the tail code can tell a negative limit (keep trailing empty
 * fields) from zero (strip them). */
if (lim <= 0) limit = Qnil;
else if (lim == 1) {
/* Limit of one: the whole string is the only field, except that an empty
 * receiver yields an empty array. */
if (RSTRING_LEN(str) == 0)
return rb_ary_new2(0);
return rb_ary_new3(1, rb_str_dup(str));
}
/* With an explicit limit the counter starts at 1 so that the final
 * remainder pushed after the loops is included in the count. */
i = 1;
}
enc = STR_ENC_GET(str);
split_type = regexp;
if (!NIL_P(spat)) {
/* Explicit pattern. NOTE(review): get_pat_quoted is defined elsewhere;
 * presumably it normalises the argument to a String or Regexp - confirm. */
spat = get_pat_quoted(spat, 0);
}
else if (NIL_P(spat = rb_fs)) {
/* No pattern and rb_fs is nil: split on whitespace. */
split_type = awk;
}
else if (!(spat = rb_fs_check(spat))) {
rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
}
if (split_type != awk) {
if (BUILTIN_TYPE(spat) == T_STRING) {
rb_encoding *enc2 = STR_ENC_GET(spat);
mustnot_broken(spat);
split_type = string;
if (RSTRING_LEN(spat) == 0) {
/* Special case - split into chars */
spat = rb_reg_regcomp(spat);
split_type = regexp;
}
else if (rb_enc_asciicompat(enc2) == 1) {
/* ASCII-compatible pattern encoding: a pattern that is exactly the
 * single byte ' ' selects whitespace (awk) splitting. */
if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' ') {
split_type = awk;
}
}
else {
/* Other encodings: decode the first character and require that it is a
 * space occupying the whole pattern (l is its byte length). */
int l;
if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
RSTRING_LEN(spat) == l) {
split_type = awk;
}
}
}
}
result = rb_ary_new();
beg = 0;
if (split_type == awk) {
/* Whitespace splitting. `skip` is 1 while scanning whitespace before the
 * start of a field and 0 while inside a field. */
char *ptr = RSTRING_PTR(str);
char *eptr = RSTRING_END(str);
char *bptr = ptr;
int skip = 1;
unsigned int c;
end = beg;
if (is_ascii_string(str)) {
/* Byte-at-a-time scan, used when is_ascii_string(str) holds. */
while (ptr < eptr) {
c = (unsigned char)*ptr++;
if (skip) {
if (ascii_isspace(c)) {
/* Still in leading/separating whitespace: move the field start forward. */
beg = ptr - bptr;
}
else {
end = ptr - bptr;
skip = 0;
/* A new field has started; if the limit is reached, stop here so the
 * code after the loops pushes everything from beg as the last field. */
if (!NIL_P(limit) && lim <= i) break;
}
}
else if (ascii_isspace(c)) {
/* Whitespace ends the current field: emit [beg, end). */
rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
skip = 1;
beg = ptr - bptr;
if (!NIL_P(limit)) ++i;
}
else {
end = ptr - bptr;
}
}
}
else {
/* Same state machine as above, but decoding one code point at a time
 * (n is the byte length of the decoded character). */
while (ptr < eptr) {
int n;
c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
ptr += n;
if (skip) {
if (rb_isspace(c)) {
beg = ptr - bptr;
}
else {
end = ptr - bptr;
skip = 0;
if (!NIL_P(limit) && lim <= i) break;
}
}
else if (rb_isspace(c)) {
rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
skip = 1;
beg = ptr - bptr;
if (!NIL_P(limit)) ++i;
}
else {
end = ptr - bptr;
}
}
}
}
else if (split_type == string) {
/* Literal-string splitting. ptr is where the next search starts and
 * substr_start is where the pending field starts; they differ only after
 * a match that was rejected for starting in the middle of a character. */
char *ptr = RSTRING_PTR(str);
char *str_start = ptr;
char *substr_start = ptr;
char *eptr = RSTRING_END(str);
char *sptr = RSTRING_PTR(spat);
long slen = RSTRING_LEN(spat);
mustnot_broken(str);
enc = rb_enc_check(str, spat);
/* `end` receives the match offset relative to ptr; a negative value ends
 * the loop. */
while (ptr < eptr &&
(end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
/* Check we are at the start of a char */
char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
if (t != ptr + end) {
/* The byte match is not at a character head: resume searching from t
 * without emitting a field. */
ptr = t;
continue;
}
rb_ary_push(result, rb_str_subseq(str, substr_start - str_start,
(ptr+end) - substr_start));
/* Step past the delimiter; the next field starts right after it. */
ptr += end + slen;
substr_start = ptr;
if (!NIL_P(limit) && lim <= ++i) break;
}
/* Offset of the remainder, consumed by the tail code below. */
beg = ptr - str_start;
}
else {
/* Regexp splitting. `start` is where the next search begins, `beg` where
 * the pending field begins; last_null records that the previous iteration
 * saw an empty match at the search position and skipped one character. */
char *ptr = RSTRING_PTR(str);
long len = RSTRING_LEN(str);
long start = beg;
long idx;
int last_null = 0;
struct re_registers *regs;
while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
regs = RMATCH_REGS(rb_backref_get());
if (start == end && BEG(0) == END(0)) {
/* Zero-length match exactly at the search position. */
if (!ptr) {
/* No string buffer at all: emit one empty string and stop. */
rb_ary_push(result, str_new_empty(str));
break;
}
else if (last_null == 1) {
/* Second empty match in a row: emit the single character at beg and
 * continue from the already-advanced start. */
rb_ary_push(result, rb_str_subseq(str, beg,
rb_enc_fast_mbclen(ptr+beg,
ptr+len,
enc)));
beg = start;
}
else {
/* First empty match: advance the search position by one character
 * (or one past the end) and retry before emitting anything. */
if (start == len)
start++;
else
start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
last_null = 1;
continue;
}
}
else {
/* Ordinary match: emit the text before it and resume after it. */
rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
beg = start = END(0);
}
last_null = 0;
/* Append every capture group that took part in the match; groups with
 * BEG(idx) == -1 are skipped. These pushes do not advance i. */
for (idx=1; idx < regs->num_regs; idx++) {
if (BEG(idx) == -1) continue;
if (BEG(idx) == END(idx))
tmp = str_new_empty(str);
else
tmp = rb_str_subseq(str, beg = BEG(idx), END(idx)-BEG(idx));
rb_ary_push(result, tmp);
}
if (!NIL_P(limit) && lim <= ++i) break;
}
}
/* Push the remainder starting at beg. This happens for a non-empty receiver
 * when a positive limit is in effect, when text is left over, or when the
 * limit was negative (so a trailing empty field is kept). */
if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
if (RSTRING_LEN(str) == beg)
tmp = str_new_empty(str);
else
tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
rb_ary_push(result, tmp);
}
/* Limit omitted or zero: drop trailing empty strings from the result. */
if (NIL_P(limit) && lim == 0) {
long len;
while ((len = RARRAY_LEN(result)) > 0 &&
(tmp = RARRAY_AREF(result, len-1), RSTRING_LEN(tmp) == 0))
rb_ary_pop(result);
}
return result;
} 2Notes
Regexes with groups and split
yonosoytu · Aug 17, 2008 · 4 thanks
When you use a Regex with capture groups, all capture groups are included in the results (interleaved with the "real" results), but they do not count towards the limit argument.
Examples:
"abc.,cde.,efg.,ghi".split(/.(,)/)
=> ["abc", ",", "cde", ",", "efg", ",", "ghi"]
"abc.,cde.,efg.,ghi".split(/(.)(,)/)
=> ["abc", ".", ",", "cde", ".", ",", "efg", ".", ",", "ghi"]
"abc.,cde.,efg.,ghi".split(/(.(,))/)
=> ["abc", ".,", ",", "cde", ".,", ",", "efg", ".,", ",", "ghi"]
"abc.,cde.,efg.,ghi".split(/(.(,))/, 2)
=> ["abc", ".,", ",", "cde.,efg.,ghi"]
"abc.,cde.,efg.,ghi".split(/(.(,))/, 3)
=> ["abc", ".,", ",", "cde", ".,", ",", "efg.,ghi"]
clarification of inputs
cartoloupe · Feb 24, 2015 · 1 thank
"split(p1 = v1, p2 = v2)"
In reading the rest of the documentation, I found "p1" and "p2" to be confusing.
I think it should be:
split( pattern, limit )