(See the previous and initial iteration.)
Intro
This time, I decided to pack the genomic data such that 4 nucleotide bases are encoded into a single byte. In other words, A
is mapped to binary 00
, C
to 01
, G
to 10
, and T
to 11
. For example, the genomic string CATG
will be encoded as 10110001
.
Code
io.github.coderodde.dna.kmerindex.GenomicSequence.java:
package io.github.coderodde.dna.kmerindex;
import java.util.Arrays;
import java.util.Objects;
/**
* This class implements a tightly packed genomic sequence over the nucleotide
* base alphabet <code>A, C, G, T</code>. Each byte, contains 4 nucleotide
* bases.
*
* @version 2.0.0 (Aug 18, 2025)
* @since 1.0.0 (Aug 17, 2025)
*/
public final class GenomicSequence {
private static final int NUCLEOTIDES_PER_BYTE = 4;
private static final int BITS_PER_CODE = 2;
private final byte[] data;
private final int length;
public GenomicSequence(String sequence) {
Objects.requireNonNull(sequence, "The input sequence is null");
data = new byte[sequence.length() / NUCLEOTIDES_PER_BYTE +
(sequence.length() % NUCLEOTIDES_PER_BYTE != 0 ? 1 : 0)];
length = sequence.length();
for (int i = 0; i < sequence.length(); ++i) {
char nucleotideBase = sequence.charAt(i);
checkNucleotideBase(nucleotideBase);
writeToData(nucleotideBase,
i % NUCLEOTIDES_PER_BYTE,
i / NUCLEOTIDES_PER_BYTE);
}
}
public char get(int nucleotideIndex) {
if (nucleotideIndex < 0) {
throw new IndexOutOfBoundsException(
String.format(
"nucleotideIndex(%d) < 0",
nucleotideIndex));
}
if (nucleotideIndex >= length) {
throw new IndexOutOfBoundsException(
String.format(
"nucleotideIndex(%d) >= length(%d)\n",
nucleotideIndex,
length));
}
int byteIndex = nucleotideIndex / NUCLEOTIDES_PER_BYTE;
nucleotideIndex %= NUCLEOTIDES_PER_BYTE;
byte datum = data[byteIndex];
datum >>>= nucleotideIndex * BITS_PER_CODE;
datum &= 0b11;
switch (datum) {
case 0b00 -> {
return 'A';
}
case 0b01 -> {
return 'C';
}
case 0b10 -> {
return 'G';
}
case 0b11 -> {
return 'T';
}
}
throw new IllegalStateException("Should not get here");
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder(length);
for (int i = 0; i < length; ++i) {
sb.append(get(i));
}
return sb.toString();
}
/**
* Extracts a kmer of length {@code k} starting from index
* {@code startIndex}.
*
* @param k the length of the requested <code>k</code>-mer.
* @param startIndex the starting index of the requested <code>k</code>-mer.
* @return the requested <code>k</code>-mer.
*/
public GenomicSequence kmer(int k, int startIndex) {
checkKmerParams(k, startIndex);
StringBuilder sb = new StringBuilder(k);
for (int i = 0; i < k; ++i) {
sb.append(get(startIndex + i));
}
return new GenomicSequence(sb.toString());
}
@Override
public int hashCode() {
return Arrays.hashCode(data);
}
@Override
public boolean equals(Object o) {
if (o == null) {
return false;
}
if (o == this) {
return true;
}
if (!getClass().equals(o.getClass())) {
return false;
}
GenomicSequence other = (GenomicSequence) o;
if (other.length != length) {
return false;
}
return Arrays.equals(data, other.data);
}
public int length() {
return length;
}
private void checkKmerParams(int k, int startIndex) {
if (k < 1) {
String exceptionMessage =
String.format(
"The k-parameter is too small (%d). Must be at least 1",
k);
throw new IllegalArgumentException(exceptionMessage);
}
if (k > length) {
String exceptionMessage = String.format("k(%d) > length(%d)",
k,
length);
throw new IllegalArgumentException(exceptionMessage);
}
if (startIndex + k > length) {
String exceptionMessage =
String.format("startIndex(%d) + k(%d) = %d > length(%d)",
startIndex,
k,
k + startIndex,
length);
throw new IllegalArgumentException(exceptionMessage);
}
}
private static void checkNucleotideBase(char candidate) {
switch (candidate) {
case 'A', 'C', 'G', 'T' -> {
return;
}
}
String exceptionMessage =
String.format(
"Invalid nucleotide base: %d\n",
candidate);
throw new IllegalArgumentException(exceptionMessage);
}
private void writeToData(char nucleotideBase,
int nucleotideIndex,
int byteIndex) {
int index = nucleotideIndex % NUCLEOTIDES_PER_BYTE;
byte mask = encodeNucleotideName(nucleotideBase);
mask <<= index * BITS_PER_CODE;
data[byteIndex] |= mask;
}
private static byte encodeNucleotideName(char nucleotide) {
switch (nucleotide) {
case 'A' -> {
return 0b00;
}
case 'C' -> {
return 0b01;
}
case 'G' -> {
return 0b10;
}
case 'T' -> {
return 0b11;
}
}
throw new IllegalStateException("Should not get here");
}
}
io.github.coderodde.dna.kmerindex.DnaKmerIndex.java:
package io.github.coderodde.dna.kmerindex;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
/**
* This class implements the <code>k</code>-mer index data structures that maps
* each <code>k</code>-mer to the list of indices at which that very
* <code>k</code>-mer appears. The alphabet is restricted to the set of
* nucleotide bases <code>A, C, G, T</pre>.
*
* @version 2.0.0 (Aug 18, 2025)
* @since 1.0.0 (Aug 17, 2025)
*/
public final class DnaKmerIndex {
// Package-private since DnaKmerIndexToStringConverter uses this:
final Map<GenomicSequence, List<Integer>> index = new HashMap<>();
public DnaKmerIndex(GenomicSequence sequence, int k) {
checkArguments(sequence, k);
for (int i = 0; i < sequence.length() - k + 1; ++i) {
GenomicSequence kmer = sequence.kmer(k, i);
if (!index.containsKey(kmer)) {
index.put(kmer, new ArrayList<>());
}
index.get(kmer).add(i);
}
}
public List<Integer> getListOfStartingIndices(GenomicSequence kmer) {
return !index.containsKey(kmer) ?
List.of() :
Collections.unmodifiableList(index.get(kmer));
}
private void checkArguments(GenomicSequence sequence, int k) {
Objects.requireNonNull(sequence, "The input GenomicSequence is null");
if (sequence.length() == 0) {
throw new IllegalArgumentException("Empty sequence");
}
if (k < 1) {
String exceptionMessage =
String.format(
"The k-parameter is too small (%d). Must be at least 1",
k);
throw new IllegalArgumentException(exceptionMessage);
}
if (k > sequence.length()) {
String exceptionMessage =
String.format("k(%d) > sequence.length(%d)",
k,
sequence.length());
throw new IllegalArgumentException(exceptionMessage);
}
}
}
io.github.coderodde.dna.kmerindex.DnaKmerIndexToStringConverter.java:
package io.github.coderodde.dna.kmerindex;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Objects;
/**
* This class is responsible for converting a
* {@link io.github.coderodde.dna.kmerindex.DnaKmerIndex} to a neat string that
* may be arbitrary or sorted.
*
* @version 1.1.0 (Aug 18, 2025)
* @since 1.0.0 (Aug 17, 2025)
*/
public final class DnaKmerIndexToStringConverter {
private boolean sorted = true;
private final DnaKmerIndex index;
public DnaKmerIndexToStringConverter(DnaKmerIndex index, boolean sorted) {
this.index = Objects.requireNonNull(index, "The input index is null");
setSorted(sorted);
}
public DnaKmerIndexToStringConverter(DnaKmerIndex index) {
this(index, true);
}
public void setSorted(boolean sorted) {
this.sorted = sorted;
}
public boolean isSorted() {
return sorted;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
for (Map.Entry<GenomicSequence, List<Integer>> e :
index.index.entrySet()) {
sb.append(e.getKey())
.append(" -> ")
.append(e.getValue())
.append('\n');
}
if (sb.length() > 0) {
sb.setLength(sb.length() - 1);
}
if (sorted) {
String[] lines = sb.toString().split("\n");
Arrays.sort(lines);
return String.join("\n", lines);
}
return sb.toString();
}
}
io.github.coderodde.dna.kmerindex.RandomGenomicSequenceProvider.java:
package io.github.coderodde.dna.kmerindex;
import java.util.Random;
/**
* This class provides a facility for computing a random genetic string.
*
* @version 1.0.0 (Aug 16, 2025)
* @since 1.0.0 (Aug 16, 2025)
*/
public class RandomGenomicSequenceProvider {
private static final char[] NUCLEOTIDE_BASES = { 'A', 'C', 'G', 'T' };
/**
* Generates a random genomic string.
*
* @param length the length of the genomic string.
* @param random the random number generator.
* @return a random genomic string.
*/
public static String generate(int length, Random random) {
StringBuilder sb = new StringBuilder(length);
for (int i = 0; i < length; ++i) {
int nucleotideIndex = random.nextInt(NUCLEOTIDE_BASES.length);
sb.append(NUCLEOTIDE_BASES[nucleotideIndex]);
}
return sb.toString();
}
/**
* Generates a random genomic string.
*
* @param length the length of the genomic string.
* @return a random genomic string.
*/
public static String generate(int length) {
return generate(length, new Random());
}
/**
* Generates a random genomic string.
*
* @param length the length of the genomic string.
* @param seed the seed for the random number generator.
* @return a random genomic string.
*/
public static String generate(int length, long seed) {
return generate(length, new Random(seed));
}
}
io.github.coderodde.dna.kmerindex.demo.Demo.java:
package io.github.coderodde.dna.kmerindex.demo;
import io.github.coderodde.dna.kmerindex.DnaKmerIndex;
import io.github.coderodde.dna.kmerindex.DnaKmerIndexToStringConverter;
import io.github.coderodde.dna.kmerindex.GenomicSequence;
import io.github.coderodde.dna.kmerindex.RandomGenomicSequenceProvider;
/**
* This class provides the demonstration program for the {@code k}-mer index.
*
* @version 1.0.0 (Aug 16, 2025)
* @since 1.0.0 (Aug 16, 2025)
*/
public final class Demo {
private static final int LENGTH = 30;
private static final int K = 2;
public static void main(String[] args) {
long seed = System.currentTimeMillis();
System.out.println("seed = " + seed);
String genomicString = RandomGenomicSequenceProvider.generate(LENGTH,
seed);
System.out.println("Genomic string: " + genomicString);
DnaKmerIndex kmerIndex =
new DnaKmerIndex(new GenomicSequence(genomicString), K);
System.out.println(new DnaKmerIndexToStringConverter(kmerIndex, false));
}
}
io.github.coderodde.dna.kmerindex.DnaKmerIndexTest.java:
package io.github.coderodde.dna.kmerindex;
import java.util.Arrays;
import java.util.List;
import static org.junit.Assert.*;
import org.junit.Test;
public class DnaKmerIndexTest {
@Test
public void getListOfStartingIndices() {
GenomicSequence gs = new GenomicSequence("ACGTAC");
DnaKmerIndex index = new DnaKmerIndex(gs, 2);
List<Integer> list = index.getListOfStartingIndices(gs.kmer(2, 0));
assertEquals(Arrays.asList(0, 4), list);
list = index.getListOfStartingIndices(gs.kmer(2, 1));
assertEquals(Arrays.asList(1), list);
list = index.getListOfStartingIndices(gs.kmer(2, 2));
assertEquals(Arrays.asList(2), list);
list = index.getListOfStartingIndices(new GenomicSequence("CC"));
assertEquals(Arrays.asList(), list);
}
}
io.github.coderodde.dna.kmerindex.GenomicSequenceTest.java:
package io.github.coderodde.dna.kmerindex;
import java.util.Random;
import static org.junit.Assert.assertEquals;
import org.junit.Test;
public class GenomicSequenceTest {
@Test
public void stressTestToString() {
Random random = new Random(100L);
for (int length = 1; length < 20; ++length) {
for (int repeat = 0; repeat < 20; ++repeat) {
String sequence =
RandomGenomicSequenceProvider.generate(length,
random);
GenomicSequence gs = new GenomicSequence(sequence);
assertEquals(sequence, gs.toString());
}
}
}
@Test
public void stressTestKmer() {
Random random = new Random(101L);
for (int length = 1; length < 20; ++length) {
String sequenceString =
RandomGenomicSequenceProvider.generate(length,
random);
GenomicSequence sequence = new GenomicSequence(sequenceString);
for (int k = 1; k <= length; ++k) {
for (int startIndex = 0;
startIndex < length - k + 1;
startIndex++) {
String kmerString = getKmerString(sequenceString,
k,
startIndex);
GenomicSequence kmerSequence = sequence.kmer(k, startIndex);
assertEquals(kmerString, kmerSequence.toString());
}
}
}
}
private static String getKmerString(String sequence,
int k,
int startIndex) {
StringBuilder sb = new StringBuilder(k);
for (int i = startIndex; i < startIndex + k; ++i) {
sb.append(sequence.charAt(i));
}
return sb.toString();
}
}
Typical demo output
seed = 1755516016804
Genomic string: TCGCTTCTCTCCCAACGTCCGGCGGGCAGG
CA -> [12, 26]
AC -> [14]
CC -> [10, 11, 18]
GC -> [2, 21, 25]
TC -> [0, 5, 7, 9, 17]
AG -> [27]
CG -> [1, 15, 19, 22]
GG -> [20, 23, 24, 28]
CT -> [3, 6, 8]
GT -> [16]
TT -> [4]
AA -> [13]
Critique request
Please, tell me anything about to improve my tiny library.