This repository contains the N-Gram Tools for the 🙃 Phony Language, which include features like sanitizing, tokenization, n-gram extraction, and frequency mapping.
Requires PHP >= 8.0.
You can install the package via composer:
composer require phonyland/ngram
$tokenizer->tokenize($text);
⌨️ Usage
use Phonyland\NGram\Tokenizer; use Phonyland\NGram\TokenizerFilterType; $tokenizer = new Tokenizer(); $tokenizer ->addWordSeparatorPattern(';') ->addWordSeparatorPattern('\s') ->addWordFilterRule(TokenizerFilterType::NO_SYMBOLS); $text = 'sample text;sample;text'; $tokenizer->tokenize($text);
📥 Output
[ "sample", "text", "sample", "text", ];
$tokenizer->sentences($text);
⌨️ Usage
use Phonyland\NGram\Tokenizer; $tokenizer = new Tokenizer(); $tokenizer ->addSentenceSeparatorPattern('.') ->addSentenceSeparatorPattern('!') ->addSentenceSeparatorPattern('?'); $text = 'Sample Sentence. Sample Sentence! Sample Sentence? Sample Sentence no. 4?! Sample sample sentence... End'; $tokenizer->sentences($text);
📥 Output
[ "Sample Sentence.", "Sample Sentence!", "Sample Sentence?", "Sample Sentence no.", "4?!", "Sample sample sentence...", "End", ];
$tokenizer->tokenizeBySentences($text);
⌨️ Usage
use Phonyland\NGram\Tokenizer; use Phonyland\NGram\TokenizerFilterType; $tokenizer = new Tokenizer(); $tokenizer ->addSentenceSeparatorPattern('.') ->addSentenceSeparatorPattern('!') ->addSentenceSeparatorPattern('?') ->addWordFilterRule(TokenizerFilterType::NO_SYMBOLS) ->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR); $text = 'Sample Sentence. Sample Sentence! Sample Sentence? Sample Sentence no. 4?! Sample sample sentence... End'; $tokenizer->tokenizeBySentences($text);
📥 Output
[ ["Sample", "Sentence"], ["Sample", "Sentence"], ["Sample", "Sentence"], ["Sample", "Sentence", "no"], ["Sample", "sample", "sentence"], ["End"], ];
NGramSequence::multigram($n, $tokens, $isUnique); NGramSequence::trigram($tokens, $isUnique); NGramSequence::bigram($tokens, $isUnique); NGramSequence::unigram($tokens, $isUnique);
⌨️ Usage
use Phonyland\NGram\Tokenizer; use Phonyland\NGram\NGramSequence; use Phonyland\NGram\TokenizerFilterType; $tokenizer = new Tokenizer(); $tokenizer->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR); $tokens = $tokenizer->tokenize('sample text'); NGramSequence::multigram(4, $tokens); // ['samp', 'ampl', 'mple', 'text']; // Generate Unique N-Grams NGramSequence::unigram($tokens, true); // ['s', 'a', 'm', 'p', 'l', 'e', 't', 'x'];
NGramCount::multigram(4, $tokens); NGramCount::trigram($tokens); NGramCount::bigram($tokens); NGramCount::unigram($tokens); NGramCount::incrementElementCount($element, $elements);
⌨️ Usage
use Phonyland\NGram\Tokenizer; use Phonyland\NGram\NGramCount; use Phonyland\NGram\TokenizerFilterType; $tokenizer = new Tokenizer(); $tokenizer->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR); $tokens = $tokenizer->tokenize('sample text'); NGramCount::multigram(4, $tokens); // [ // 'samp' => 1, // 'ampl' => 1, // 'mple' => 1, // 'text' => 1, // ];
NGramFrequency::multigram(4, $tokens); NGramFrequency::trigram($tokens); NGramFrequency::bigram($tokens); NGramFrequency::unigram($tokens); NGramFrequency::frequencyFromCount($countArray);
⌨️ Usage
use Phonyland\NGram\Tokenizer; use Phonyland\NGram\NGramFrequency; use Phonyland\NGram\TokenizerFilterType; $tokenizer = new Tokenizer(); $tokenizer->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR); $tokenizer->addWordFilterRule(TokenizerFilterType::ALPHABETICAL); $tokens = $tokenizer->tokenize('bombadil! bombadillo!'); NGramFrequency::multigram(4, $tokens); //[ // 'bomb' => 0.16666666666666666, // 'omba' => 0.16666666666666666, // 'mbad' => 0.16666666666666666, // 'badi' => 0.16666666666666666, // 'adil' => 0.16666666666666666, // 'dill' => 0.08333333333333333, // 'illo' => 0.08333333333333333, //]
Start generating fake data with 🙃 Phony Framework,
visit the main Phony Repository.
Explore the docs » https://phony.land
Follow us on Twitter » @phony_land
🙃 Phony
Fake Data Generation Framework
was created by
Yunus Emre Deligöz
under
MIT license.