2
\$\begingroup\$

(See the previous and initial iteration.)

Intro

This time, I decided to pack the genomic data such that 4 nucleotide bases are encoded into a single byte. In other words, A is mapped to binary 00, C to 01, G to 10, and T to 11. For example, the genomic string CATG will be encoded as 10110001.

Code

io.github.coderodde.dna.kmerindex.GenomicSequence.java:

package io.github.coderodde.dna.kmerindex;
import java.util.Arrays;
import java.util.Objects;
/**
 * This class implements a tightly packed genomic sequence over the nucleotide
 * base alphabet <code>A, C, G, T</code>. Each byte, contains 4 nucleotide
 * bases.
 * 
 * @version 2.0.0 (Aug 18, 2025)
 * @since 1.0.0 (Aug 17, 2025)
 */
public final class GenomicSequence {
 
 private static final int NUCLEOTIDES_PER_BYTE = 4;
 private static final int BITS_PER_CODE = 2;
 
 private final byte[] data;
 private final int length;
 
 public GenomicSequence(String sequence) {
 Objects.requireNonNull(sequence, "The input sequence is null");
 data = new byte[sequence.length() / NUCLEOTIDES_PER_BYTE +
 (sequence.length() % NUCLEOTIDES_PER_BYTE != 0 ? 1 : 0)];
 
 length = sequence.length();
 
 for (int i = 0; i < sequence.length(); ++i) {
 char nucleotideBase = sequence.charAt(i);
 checkNucleotideBase(nucleotideBase);
 
 writeToData(nucleotideBase,
 i % NUCLEOTIDES_PER_BYTE,
 i / NUCLEOTIDES_PER_BYTE);
 }
 }
 
 public char get(int nucleotideIndex) {
 if (nucleotideIndex < 0) {
 throw new IndexOutOfBoundsException(
 String.format(
 "nucleotideIndex(%d) < 0",
 nucleotideIndex));
 }
 
 if (nucleotideIndex >= length) {
 throw new IndexOutOfBoundsException(
 String.format(
 "nucleotideIndex(%d) >= length(%d)\n", 
 nucleotideIndex, 
 length));
 }
 
 int byteIndex = nucleotideIndex / NUCLEOTIDES_PER_BYTE;
 nucleotideIndex %= NUCLEOTIDES_PER_BYTE;
 
 byte datum = data[byteIndex];
 datum >>>= nucleotideIndex * BITS_PER_CODE;
 datum &= 0b11;
 
 switch (datum) {
 case 0b00 -> {
 return 'A';
 }
 
 case 0b01 -> {
 return 'C';
 }
 
 case 0b10 -> {
 return 'G';
 }
 
 case 0b11 -> {
 return 'T'; 
 }
 }
 
 throw new IllegalStateException("Should not get here");
 }
 
 @Override
 public String toString() {
 StringBuilder sb = new StringBuilder(length);
 
 for (int i = 0; i < length; ++i) {
 sb.append(get(i));
 }
 
 return sb.toString();
 }
 
 /**
 * Extracts a kmer of length {@code k} starting from index 
 * {@code startIndex}.
 * 
 * @param k the length of the requested <code>k</code>-mer.
 * @param startIndex the starting index of the requested <code>k</code>-mer.
 * @return the requested <code>k</code>-mer.
 */
 public GenomicSequence kmer(int k, int startIndex) {
 checkKmerParams(k, startIndex);
 
 StringBuilder sb = new StringBuilder(k);
 
 for (int i = 0; i < k; ++i) {
 sb.append(get(startIndex + i));
 }
 
 return new GenomicSequence(sb.toString());
 }
 
 @Override
 public int hashCode() {
 return Arrays.hashCode(data);
 }
 
 @Override
 public boolean equals(Object o) {
 if (o == null) {
 return false;
 }
 
 if (o == this) {
 return true;
 }
 
 if (!getClass().equals(o.getClass())) {
 return false;
 }
 
 GenomicSequence other = (GenomicSequence) o;
 
 if (other.length != length) {
 return false;
 }
 
 return Arrays.equals(data, other.data);
 }
 
 public int length() {
 return length;
 }
 
 private void checkKmerParams(int k, int startIndex) {
 if (k < 1) {
 String exceptionMessage = 
 String.format(
 "The k-parameter is too small (%d). Must be at least 1", 
 k);
 
 throw new IllegalArgumentException(exceptionMessage);
 }
 
 if (k > length) {
 String exceptionMessage = String.format("k(%d) > length(%d)", 
 k,
 length);
 
 throw new IllegalArgumentException(exceptionMessage);
 }
 
 if (startIndex + k > length) {
 String exceptionMessage = 
 String.format("startIndex(%d) + k(%d) = %d > length(%d)",
 startIndex,
 k,
 k + startIndex,
 length);
 
 throw new IllegalArgumentException(exceptionMessage);
 }
 }
 
 private static void checkNucleotideBase(char candidate) {
 switch (candidate) {
 case 'A', 'C', 'G', 'T' -> {
 return;
 }
 }
 
 String exceptionMessage = 
 String.format(
 "Invalid nucleotide base: %d\n", 
 candidate);
 
 throw new IllegalArgumentException(exceptionMessage);
 }
 
 private void writeToData(char nucleotideBase,
 int nucleotideIndex,
 int byteIndex) {
 
 int index = nucleotideIndex % NUCLEOTIDES_PER_BYTE;
 byte mask = encodeNucleotideName(nucleotideBase);
 mask <<= index * BITS_PER_CODE;
 data[byteIndex] |= mask;
 }
 
 private static byte encodeNucleotideName(char nucleotide) {
 switch (nucleotide) {
 case 'A' -> {
 return 0b00;
 }
 
 case 'C' -> {
 return 0b01;
 }
 
 case 'G' -> {
 return 0b10;
 }
 
 case 'T' -> {
 return 0b11;
 }
 }
 
 throw new IllegalStateException("Should not get here");
 }
}

io.github.coderodde.dna.kmerindex.DnaKmerIndex.java:

package io.github.coderodde.dna.kmerindex;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
/**
 * This class implements the <code>k</code>-mer index data structures that maps
 * each <code>k</code>-mer to the list of indices at which that very 
 * <code>k</code>-mer appears. The alphabet is restricted to the set of 
 * nucleotide bases <code>A, C, G, T</pre>.
 * 
 * @version 2.0.0 (Aug 18, 2025)
 * @since 1.0.0 (Aug 17, 2025)
 */
public final class DnaKmerIndex {
 
 // Package-private since DnaKmerIndexToStringConverter uses this:
 final Map<GenomicSequence, List<Integer>> index = new HashMap<>();
 
 public DnaKmerIndex(GenomicSequence sequence, int k) {
 checkArguments(sequence, k);
 
 for (int i = 0; i < sequence.length() - k + 1; ++i) {
 GenomicSequence kmer = sequence.kmer(k, i);
 
 if (!index.containsKey(kmer)) {
 index.put(kmer, new ArrayList<>());
 }
 
 index.get(kmer).add(i);
 }
 }
 
 public List<Integer> getListOfStartingIndices(GenomicSequence kmer) {
 return !index.containsKey(kmer) ? 
 List.of() : 
 Collections.unmodifiableList(index.get(kmer));
 }
 
 private void checkArguments(GenomicSequence sequence, int k) {
 Objects.requireNonNull(sequence, "The input GenomicSequence is null");
 
 if (sequence.length() == 0) {
 throw new IllegalArgumentException("Empty sequence");
 }
 
 if (k < 1) {
 String exceptionMessage = 
 String.format(
 "The k-parameter is too small (%d). Must be at least 1", 
 k);
 
 throw new IllegalArgumentException(exceptionMessage);
 }
 
 if (k > sequence.length()) {
 String exceptionMessage = 
 String.format("k(%d) > sequence.length(%d)", 
 k,
 sequence.length());
 
 throw new IllegalArgumentException(exceptionMessage);
 }
 }
}

io.github.coderodde.dna.kmerindex.DnaKmerIndexToStringConverter.java:

package io.github.coderodde.dna.kmerindex;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Objects;
/**
 * This class is responsible for converting a 
 * {@link io.github.coderodde.dna.kmerindex.DnaKmerIndex} to a neat string that
 * may be arbitrary or sorted.
 * 
 * @version 1.1.0 (Aug 18, 2025)
 * @since 1.0.0 (Aug 17, 2025)
 */
public final class DnaKmerIndexToStringConverter {
 
 private boolean sorted = true;
 private final DnaKmerIndex index;
 
 public DnaKmerIndexToStringConverter(DnaKmerIndex index, boolean sorted) {
 this.index = Objects.requireNonNull(index, "The input index is null");
 setSorted(sorted);
 }
 
 public DnaKmerIndexToStringConverter(DnaKmerIndex index) {
 this(index, true);
 }
 
 public void setSorted(boolean sorted) {
 this.sorted = sorted;
 }
 
 public boolean isSorted() {
 return sorted;
 }
 
 @Override
 public String toString() {
 StringBuilder sb = new StringBuilder();
 
 for (Map.Entry<GenomicSequence, List<Integer>> e :
 index.index.entrySet()) {
 
 sb.append(e.getKey())
 .append(" -> ")
 .append(e.getValue())
 .append('\n');
 }
 
 if (sb.length() > 0) {
 sb.setLength(sb.length() - 1);
 }
 
 if (sorted) {
 String[] lines = sb.toString().split("\n");
 Arrays.sort(lines);
 return String.join("\n", lines);
 }
 
 return sb.toString();
 }
}

io.github.coderodde.dna.kmerindex.RandomGenomicSequenceProvider.java:

package io.github.coderodde.dna.kmerindex;
import java.util.Random;
/**
 * This class provides a facility for computing a random genetic string.
 * 
 * @version 1.0.0 (Aug 16, 2025)
 * @since 1.0.0 (Aug 16, 2025)
 */
public class RandomGenomicSequenceProvider {
 
 private static final char[] NUCLEOTIDE_BASES = { 'A', 'C', 'G', 'T' };
 
 /**
 * Generates a random genomic string.
 * 
 * @param length the length of the genomic string.
 * @param random the random number generator.
 * @return a random genomic string.
 */
 public static String generate(int length, Random random) {
 StringBuilder sb = new StringBuilder(length);
 
 for (int i = 0; i < length; ++i) {
 int nucleotideIndex = random.nextInt(NUCLEOTIDE_BASES.length);
 sb.append(NUCLEOTIDE_BASES[nucleotideIndex]);
 }
 
 return sb.toString();
 }
 
 /**
 * Generates a random genomic string.
 * 
 * @param length the length of the genomic string.
 * @return a random genomic string.
 */
 public static String generate(int length) {
 return generate(length, new Random());
 }
 
 /**
 * Generates a random genomic string.
 * 
 * @param length the length of the genomic string.
 * @param seed the seed for the random number generator.
 * @return a random genomic string.
 */
 public static String generate(int length, long seed) {
 return generate(length, new Random(seed));
 }
}

io.github.coderodde.dna.kmerindex.demo.Demo.java:

package io.github.coderodde.dna.kmerindex.demo;
import io.github.coderodde.dna.kmerindex.DnaKmerIndex;
import io.github.coderodde.dna.kmerindex.DnaKmerIndexToStringConverter;
import io.github.coderodde.dna.kmerindex.GenomicSequence;
import io.github.coderodde.dna.kmerindex.RandomGenomicSequenceProvider;
/**
 * This class provides the demonstration program for the {@code k}-mer index.
 * 
 * @version 1.0.0 (Aug 16, 2025)
 * @since 1.0.0 (Aug 16, 2025)
 */
public final class Demo {
 
 private static final int LENGTH = 30;
 private static final int K = 2;
 
 public static void main(String[] args) {
 long seed = System.currentTimeMillis();
 
 System.out.println("seed = " + seed);
 
 String genomicString = RandomGenomicSequenceProvider.generate(LENGTH,
 seed);
 
 System.out.println("Genomic string: " + genomicString);
 
 DnaKmerIndex kmerIndex = 
 new DnaKmerIndex(new GenomicSequence(genomicString), K);
 
 System.out.println(new DnaKmerIndexToStringConverter(kmerIndex, false));
 }
}

io.github.coderodde.dna.kmerindex.DnaKmerIndexTest.java:

package io.github.coderodde.dna.kmerindex;
import java.util.Arrays;
import java.util.List;
import static org.junit.Assert.*;
import org.junit.Test;
public class DnaKmerIndexTest {
 
 @Test
 public void getListOfStartingIndices() {
 GenomicSequence gs = new GenomicSequence("ACGTAC");
 DnaKmerIndex index = new DnaKmerIndex(gs, 2);
 
 List<Integer> list = index.getListOfStartingIndices(gs.kmer(2, 0));
 
 assertEquals(Arrays.asList(0, 4), list);
 
 list = index.getListOfStartingIndices(gs.kmer(2, 1));
 
 assertEquals(Arrays.asList(1), list);
 
 list = index.getListOfStartingIndices(gs.kmer(2, 2));
 
 assertEquals(Arrays.asList(2), list);
 
 list = index.getListOfStartingIndices(new GenomicSequence("CC"));
 
 assertEquals(Arrays.asList(), list);
 }
}

io.github.coderodde.dna.kmerindex.GenomicSequenceTest.java:

package io.github.coderodde.dna.kmerindex;
import java.util.Random;
import static org.junit.Assert.assertEquals;
import org.junit.Test;
public class GenomicSequenceTest {
 
 @Test
 public void stressTestToString() {
 Random random = new Random(100L);
 
 for (int length = 1; length < 20; ++length) {
 for (int repeat = 0; repeat < 20; ++repeat) {
 String sequence = 
 RandomGenomicSequenceProvider.generate(length,
 random);
 
 GenomicSequence gs = new GenomicSequence(sequence);
 assertEquals(sequence, gs.toString());
 }
 }
 }
 
 @Test
 public void stressTestKmer() {
 Random random = new Random(101L);
 
 for (int length = 1; length < 20; ++length) {
 
 String sequenceString =
 RandomGenomicSequenceProvider.generate(length,
 random);
 
 GenomicSequence sequence = new GenomicSequence(sequenceString);
 
 for (int k = 1; k <= length; ++k) {
 
 for (int startIndex = 0; 
 startIndex < length - k + 1; 
 startIndex++) {
 String kmerString = getKmerString(sequenceString,
 k,
 startIndex);
 
 GenomicSequence kmerSequence = sequence.kmer(k, startIndex);
 assertEquals(kmerString, kmerSequence.toString());
 }
 }
 }
 }
 
 private static String getKmerString(String sequence, 
 int k, 
 int startIndex) {
 
 StringBuilder sb = new StringBuilder(k);
 
 for (int i = startIndex; i < startIndex + k; ++i) {
 sb.append(sequence.charAt(i));
 }
 
 return sb.toString();
 }
}

Typical demo output

seed = 1755516016804
Genomic string: TCGCTTCTCTCCCAACGTCCGGCGGGCAGG
CA -> [12, 26]
AC -> [14]
CC -> [10, 11, 18]
GC -> [2, 21, 25]
TC -> [0, 5, 7, 9, 17]
AG -> [27]
CG -> [1, 15, 19, 22]
GG -> [20, 23, 24, 28]
CT -> [3, 6, 8]
GT -> [16]
TT -> [4]
AA -> [13]

Critique request

Please, tell me anything about to improve my tiny library.

asked Aug 18 at 11:22
\$\endgroup\$

0

Know someone who can answer? Share a link to this question via email, Twitter, or Facebook.

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.