|
from typing import List


class Solution:
    """Holds the UTF-8 validation routine (LeetCode-style ``Solution`` class)."""

    def validUtf8(self, data: List[int]) -> bool:
        """Return whether ``data`` is a valid sequence of UTF-8 encoded characters.

        Each integer in ``data`` represents one byte; only its least
        significant 8 bits are inspected. A character is 1 to 4 bytes long:

        - A 1-byte character has its first bit set to 0.
        - An n-byte character starts with n one-bits followed by a zero-bit,
          and is followed by n - 1 bytes whose two most significant bits
          are ``10``.

            Number of Bytes | UTF-8 Octet Sequence (binary)
            ----------------+-------------------------------------
                   1        | 0xxxxxxx
                   2        | 110xxxxx 10xxxxxx
                   3        | 1110xxxx 10xxxxxx 10xxxxxx
                   4        | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

        ``x`` denotes a bit that may be either 0 or 1. Only this bit layout
        is checked; the encoded code-point values themselves are not
        validated.

        Args:
            data: Integers whose low 8 bits are the bytes to validate.

        Returns:
            True if every byte belongs to a well-formed character, False
            otherwise. An empty list is considered valid.
        """
        total = len(data)
        i = 0
        while i < total:
            lead = data[i]

            # 0xxxxxxx: a complete single-byte character.
            if not lead & 0x80:
                i += 1
                continue

            # Count the run of leading one-bits, looking at bits 7 down to 3.
            # A result of 5 means the byte looks like 11111xxx.
            n = 0
            for mask in (0x80, 0x40, 0x20, 0x10, 0x08):
                if not lead & mask:
                    break
                n += 1

            # n == 1 is a stray continuation byte (10xxxxxx) in lead position;
            # n == 5 would announce a character longer than 4 bytes.
            if n < 2 or n > 4:
                return False

            # The announced character must not run past the end of the data.
            if i + n > total:
                return False

            # Each of the following n - 1 bytes must look like 10xxxxxx.
            if not all((data[j] & 0xC0) == 0x80 for j in range(i + 1, i + n)):
                return False

            i += n
        return True
0 commit comments