Funny time with DNA: a \$k\$-mer index data structure in Java, Take II

Question

(See the previous and initial iteration.)

Intro

This time, I decided to pack the genomic data such that 4 nucleotide bases are encoded into a single byte. In other words, A is mapped to binary 00, C to 01, G to 10, and T to 11. For example, the genomic string CATG will be encoded as 10110001.

Code

io.github.coderodde.dna.kmerindex.GenomicSequence.java:

package io.github.coderodde.dna.kmerindex;
import java.util.Arrays;
import java.util.Objects;
/**
 * This class implements a tightly packed genomic sequence over the nucleotide
 * base alphabet <code>A, C, G, T</code>. Each byte, contains 4 nucleotide
 * bases.
 * 
 * @version 2.0.0 (Aug 18, 2025)
 * @since 1.0.0 (Aug 17, 2025)
 */
public final class GenomicSequence {
 
 private static final int NUCLEOTIDES_PER_BYTE = 4;
 private static final int BITS_PER_CODE = 2;
 
 private final byte[] data;
 private final int length;
 
 public GenomicSequence(String sequence) {
 Objects.requireNonNull(sequence, "The input sequence is null");
 data = new byte[sequence.length() / NUCLEOTIDES_PER_BYTE +
 (sequence.length() % NUCLEOTIDES_PER_BYTE != 0 ? 1 : 0)];
 
 length = sequence.length();
 
 for (int i = 0; i < sequence.length(); ++i) {
 char nucleotideBase = sequence.charAt(i);
 checkNucleotideBase(nucleotideBase);
 
 writeToData(nucleotideBase,
 i % NUCLEOTIDES_PER_BYTE,
 i / NUCLEOTIDES_PER_BYTE);
 }
 }
 
 public char get(int nucleotideIndex) {
 if (nucleotideIndex < 0) {
 throw new IndexOutOfBoundsException(
 String.format(
 "nucleotideIndex(%d) < 0",
 nucleotideIndex));
 }
 
 if (nucleotideIndex >= length) {
 throw new IndexOutOfBoundsException(
 String.format(
 "nucleotideIndex(%d) >= length(%d)\n", 
 nucleotideIndex, 
 length));
 }
 
 int byteIndex = nucleotideIndex / NUCLEOTIDES_PER_BYTE;
 nucleotideIndex %= NUCLEOTIDES_PER_BYTE;
 
 byte datum = data[byteIndex];
 datum >>>= nucleotideIndex * BITS_PER_CODE;
 datum &= 0b11;
 
 switch (datum) {
 case 0b00 -> {
 return 'A';
 }
 
 case 0b01 -> {
 return 'C';
 }
 
 case 0b10 -> {
 return 'G';
 }
 
 case 0b11 -> {
 return 'T'; 
 }
 }
 
 throw new IllegalStateException("Should not get here");
 }
 
 @Override
 public String toString() {
 StringBuilder sb = new StringBuilder(length);
 
 for (int i = 0; i < length; ++i) {
 sb.append(get(i));
 }
 
 return sb.toString();
 }
 
 /**
 * Extracts a kmer of length {@code k} starting from index 
 * {@code startIndex}.
 * 
 * @param k the length of the requested <code>k</code>-mer.
 * @param startIndex the starting index of the requested <code>k</code>-mer.
 * @return the requested <code>k</code>-mer.
 */
 public GenomicSequence kmer(int k, int startIndex) {
 checkKmerParams(k, startIndex);
 
 StringBuilder sb = new StringBuilder(k);
 
 for (int i = 0; i < k; ++i) {
 sb.append(get(startIndex + i));
 }
 
 return new GenomicSequence(sb.toString());
 }
 
 @Override
 public int hashCode() {
 return Arrays.hashCode(data);
 }
 
 @Override
 public boolean equals(Object o) {
 if (o == null) {
 return false;
 }
 
 if (o == this) {
 return true;
 }
 
 if (!getClass().equals(o.getClass())) {
 return false;
 }
 
 GenomicSequence other = (GenomicSequence) o;
 
 if (other.length != length) {
 return false;
 }
 
 return Arrays.equals(data, other.data);
 }
 
 public int length() {
 return length;
 }
 
 private void checkKmerParams(int k, int startIndex) {
 if (k < 1) {
 String exceptionMessage = 
 String.format(
 "The k-parameter is too small (%d). Must be at least 1", 
 k);
 
 throw new IllegalArgumentException(exceptionMessage);
 }
 
 if (k > length) {
 String exceptionMessage = String.format("k(%d) > length(%d)", 
 k,
 length);
 
 throw new IllegalArgumentException(exceptionMessage);
 }
 
 if (startIndex + k > length) {
 String exceptionMessage = 
 String.format("startIndex(%d) + k(%d) = %d > length(%d)",
 startIndex,
 k,
 k + startIndex,
 length);
 
 throw new IllegalArgumentException(exceptionMessage);
 }
 }
 
 private static void checkNucleotideBase(char candidate) {
 switch (candidate) {
 case 'A', 'C', 'G', 'T' -> {
 return;
 }
 }
 
 String exceptionMessage = 
 String.format(
 "Invalid nucleotide base: %d\n", 
 candidate);
 
 throw new IllegalArgumentException(exceptionMessage);
 }
 
 private void writeToData(char nucleotideBase,
 int nucleotideIndex,
 int byteIndex) {
 
 int index = nucleotideIndex % NUCLEOTIDES_PER_BYTE;
 byte mask = encodeNucleotideName(nucleotideBase);
 mask <<= index * BITS_PER_CODE;
 data[byteIndex] |= mask;
 }
 
 private static byte encodeNucleotideName(char nucleotide) {
 switch (nucleotide) {
 case 'A' -> {
 return 0b00;
 }
 
 case 'C' -> {
 return 0b01;
 }
 
 case 'G' -> {
 return 0b10;
 }
 
 case 'T' -> {
 return 0b11;
 }
 }
 
 throw new IllegalStateException("Should not get here");
 }
}

io.github.coderodde.dna.kmerindex.DnaKmerIndex.java:

package io.github.coderodde.dna.kmerindex;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
/**
 * This class implements the <code>k</code>-mer index data structures that maps
 * each <code>k</code>-mer to the list of indices at which that very 
 * <code>k</code>-mer appears. The alphabet is restricted to the set of 
 * nucleotide bases <code>A, C, G, T</pre>.
 * 
 * @version 2.0.0 (Aug 18, 2025)
 * @since 1.0.0 (Aug 17, 2025)
 */
public final class DnaKmerIndex {
 
 // Package-private since DnaKmerIndexToStringConverter uses this:
 final Map<GenomicSequence, List<Integer>> index = new HashMap<>();
 
 public DnaKmerIndex(GenomicSequence sequence, int k) {
 checkArguments(sequence, k);
 
 for (int i = 0; i < sequence.length() - k + 1; ++i) {
 GenomicSequence kmer = sequence.kmer(k, i);
 
 if (!index.containsKey(kmer)) {
 index.put(kmer, new ArrayList<>());
 }
 
 index.get(kmer).add(i);
 }
 }
 
 public List<Integer> getListOfStartingIndices(GenomicSequence kmer) {
 return !index.containsKey(kmer) ? 
 List.of() : 
 Collections.unmodifiableList(index.get(kmer));
 }
 
 private void checkArguments(GenomicSequence sequence, int k) {
 Objects.requireNonNull(sequence, "The input GenomicSequence is null");
 
 if (sequence.length() == 0) {
 throw new IllegalArgumentException("Empty sequence");
 }
 
 if (k < 1) {
 String exceptionMessage = 
 String.format(
 "The k-parameter is too small (%d). Must be at least 1", 
 k);
 
 throw new IllegalArgumentException(exceptionMessage);
 }
 
 if (k > sequence.length()) {
 String exceptionMessage = 
 String.format("k(%d) > sequence.length(%d)", 
 k,
 sequence.length());
 
 throw new IllegalArgumentException(exceptionMessage);
 }
 }
}

io.github.coderodde.dna.kmerindex.DnaKmerIndexToStringConverter.java:

package io.github.coderodde.dna.kmerindex;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Objects;
/**
 * This class is responsible for converting a 
 * {@link io.github.coderodde.dna.kmerindex.DnaKmerIndex} to a neat string that
 * may be arbitrary or sorted.
 * 
 * @version 1.1.0 (Aug 18, 2025)
 * @since 1.0.0 (Aug 17, 2025)
 */
public final class DnaKmerIndexToStringConverter {
 
 private boolean sorted = true;
 private final DnaKmerIndex index;
 
 public DnaKmerIndexToStringConverter(DnaKmerIndex index, boolean sorted) {
 this.index = Objects.requireNonNull(index, "The input index is null");
 setSorted(sorted);
 }
 
 public DnaKmerIndexToStringConverter(DnaKmerIndex index) {
 this(index, true);
 }
 
 public void setSorted(boolean sorted) {
 this.sorted = sorted;
 }
 
 public boolean isSorted() {
 return sorted;
 }
 
 @Override
 public String toString() {
 StringBuilder sb = new StringBuilder();
 
 for (Map.Entry<GenomicSequence, List<Integer>> e :
 index.index.entrySet()) {
 
 sb.append(e.getKey())
 .append(" -> ")
 .append(e.getValue())
 .append('\n');
 }
 
 if (sb.length() > 0) {
 sb.setLength(sb.length() - 1);
 }
 
 if (sorted) {
 String[] lines = sb.toString().split("\n");
 Arrays.sort(lines);
 return String.join("\n", lines);
 }
 
 return sb.toString();
 }
}

io.github.coderodde.dna.kmerindex.RandomGenomicSequenceProvider.java:

package io.github.coderodde.dna.kmerindex;
import java.util.Random;
/**
 * This class provides a facility for computing a random genetic string.
 * 
 * @version 1.0.0 (Aug 16, 2025)
 * @since 1.0.0 (Aug 16, 2025)
 */
public class RandomGenomicSequenceProvider {
 
 private static final char[] NUCLEOTIDE_BASES = { 'A', 'C', 'G', 'T' };
 
 /**
 * Generates a random genomic string.
 * 
 * @param length the length of the genomic string.
 * @param random the random number generator.
 * @return a random genomic string.
 */
 public static String generate(int length, Random random) {
 StringBuilder sb = new StringBuilder(length);
 
 for (int i = 0; i < length; ++i) {
 int nucleotideIndex = random.nextInt(NUCLEOTIDE_BASES.length);
 sb.append(NUCLEOTIDE_BASES[nucleotideIndex]);
 }
 
 return sb.toString();
 }
 
 /**
 * Generates a random genomic string.
 * 
 * @param length the length of the genomic string.
 * @return a random genomic string.
 */
 public static String generate(int length) {
 return generate(length, new Random());
 }
 
 /**
 * Generates a random genomic string.
 * 
 * @param length the length of the genomic string.
 * @param seed the seed for the random number generator.
 * @return a random genomic string.
 */
 public static String generate(int length, long seed) {
 return generate(length, new Random(seed));
 }
}

io.github.coderodde.dna.kmerindex.demo.Demo.java:

package io.github.coderodde.dna.kmerindex.demo;
import io.github.coderodde.dna.kmerindex.DnaKmerIndex;
import io.github.coderodde.dna.kmerindex.DnaKmerIndexToStringConverter;
import io.github.coderodde.dna.kmerindex.GenomicSequence;
import io.github.coderodde.dna.kmerindex.RandomGenomicSequenceProvider;
/**
 * This class provides the demonstration program for the {@code k}-mer index.
 * 
 * @version 1.0.0 (Aug 16, 2025)
 * @since 1.0.0 (Aug 16, 2025)
 */
public final class Demo {
 
 private static final int LENGTH = 30;
 private static final int K = 2;
 
 public static void main(String[] args) {
 long seed = System.currentTimeMillis();
 
 System.out.println("seed = " + seed);
 
 String genomicString = RandomGenomicSequenceProvider.generate(LENGTH,
 seed);
 
 System.out.println("Genomic string: " + genomicString);
 
 DnaKmerIndex kmerIndex = 
 new DnaKmerIndex(new GenomicSequence(genomicString), K);
 
 System.out.println(new DnaKmerIndexToStringConverter(kmerIndex, false));
 }
}

io.github.coderodde.dna.kmerindex.DnaKmerIndexTest.java:

package io.github.coderodde.dna.kmerindex;
import java.util.Arrays;
import java.util.List;
import static org.junit.Assert.*;
import org.junit.Test;
public class DnaKmerIndexTest {
 
 @Test
 public void getListOfStartingIndices() {
 GenomicSequence gs = new GenomicSequence("ACGTAC");
 DnaKmerIndex index = new DnaKmerIndex(gs, 2);
 
 List<Integer> list = index.getListOfStartingIndices(gs.kmer(2, 0));
 
 assertEquals(Arrays.asList(0, 4), list);
 
 list = index.getListOfStartingIndices(gs.kmer(2, 1));
 
 assertEquals(Arrays.asList(1), list);
 
 list = index.getListOfStartingIndices(gs.kmer(2, 2));
 
 assertEquals(Arrays.asList(2), list);
 
 list = index.getListOfStartingIndices(new GenomicSequence("CC"));
 
 assertEquals(Arrays.asList(), list);
 }
}

io.github.coderodde.dna.kmerindex.GenomicSequenceTest.java:

package io.github.coderodde.dna.kmerindex;
import java.util.Random;
import static org.junit.Assert.assertEquals;
import org.junit.Test;
public class GenomicSequenceTest {
 
 @Test
 public void stressTestToString() {
 Random random = new Random(100L);
 
 for (int length = 1; length < 20; ++length) {
 for (int repeat = 0; repeat < 20; ++repeat) {
 String sequence = 
 RandomGenomicSequenceProvider.generate(length,
 random);
 
 GenomicSequence gs = new GenomicSequence(sequence);
 assertEquals(sequence, gs.toString());
 }
 }
 }
 
 @Test
 public void stressTestKmer() {
 Random random = new Random(101L);
 
 for (int length = 1; length < 20; ++length) {
 
 String sequenceString =
 RandomGenomicSequenceProvider.generate(length,
 random);
 
 GenomicSequence sequence = new GenomicSequence(sequenceString);
 
 for (int k = 1; k <= length; ++k) {
 
 for (int startIndex = 0; 
 startIndex < length - k + 1; 
 startIndex++) {
 String kmerString = getKmerString(sequenceString,
 k,
 startIndex);
 
 GenomicSequence kmerSequence = sequence.kmer(k, startIndex);
 assertEquals(kmerString, kmerSequence.toString());
 }
 }
 }
 }
 
 private static String getKmerString(String sequence, 
 int k, 
 int startIndex) {
 
 StringBuilder sb = new StringBuilder(k);
 
 for (int i = startIndex; i < startIndex + k; ++i) {
 sb.append(sequence.charAt(i));
 }
 
 return sb.toString();
 }
}

Typical demo output

seed = 1755516016804
Genomic string: TCGCTTCTCTCCCAACGTCCGGCGGGCAGG
CA -> [12, 26]
AC -> [14]
CC -> [10, 11, 18]
GC -> [2, 21, 25]
TC -> [0, 5, 7, 9, 17]
AG -> [27]
CG -> [1, 15, 19, 22]
GG -> [20, 23, 24, 28]
CT -> [3, 6, 8]
GT -> [16]
TT -> [4]
AA -> [13]

Critique request

Please, tell me anything about to improve my tiny library.

Stack Exchange Network

Funny time with DNA: a \$k\$-mer index data structure in Java, Take II

Intro

Code

Typical demo output

Critique request

0

Know someone who can answer? Share a link to this question via email, Twitter, or Facebook.

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Linked

Hot Network Questions

Funny time with DNA: a \$k\$-mer index data structure in Java, Take II

Intro

Code

Typical demo output

Critique request

0

Know someone who can answer? Share a link to this question via email, Twitter, or Facebook.

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Linked

Related

Hot Network Questions