Expose some functionality from `Vault` as interal methods (81f902f) - derivepassphrase.git

Expose some functionality from `Vault` as interal methods

Marco Ricci commited on 2024-06-22 21:19:30
Zeige 2 geänderte Dateien mit 161 Einfügungen und 24 Löschungen.

Expose some functionality from `derivepassphrase.Vault` as interal
methods, to facilitate testing and to avoid reimplementing the same
functionality again in the command-line interface.  This includes hash
length estimation and SSH key suitability checking.

src/derivepassphrase/__init__.py 0c1bd78..2f595c1
tests/test_passphrase_generating.py ab378c6..e668c26

src/derivepassphrase/__init__.py

Zeige Datei @ 81f902f

@@ -132,19 +132,70 @@ class Vault:
         for _ in range(len(self._required), self._length):
             self._required.append(bytes(self._allowed))
 
-    def _entropy_upper_bound(self) -> int:
+    def _entropy(self) -> float:
         """Estimate the passphrase entropy, given the current settings.
 
         The entropy is the base 2 logarithm of the amount of
-        possibilities.  We operate directly on the logarithms, and round
-        each summand up, overestimating the true entropy.
+        possibilities.  We operate directly on the logarithms, and use
+        sorting and [`math.fsum`][] to keep high accuracy.
+
+        Note:
+            We actually overestimate the entropy here because of poor
+            handling of character repetitions.  In the extreme, assuming
+            that only one character were allowed, then because there is
+            only one possible string of each given length, the entropy
+            of that string `s` is always be zero.  However, we calculate
+            the entropy as `math.log2(math.factorial(len(s)))`, i.e. we
+            assume the characters at the respective string position are
+            distinguishable from each other.
+
+        Returns:
+            A valid (and somewhat close) upper bound to the entropy.
 
         """
         factors: list[int] = []
+        if not self._required or any(not x for x in self._required):
+            return float('-inf')
         for i, charset in enumerate(self._required):
             factors.append(i + 1)
             factors.append(len(charset))
-        return sum(int(math.ceil(math.log2(f))) for f in factors)
+        factors.sort()
+        return math.fsum(math.log2(f) for f in factors)
+
+    def _estimate_sufficient_hash_length(
+        self, safety_factor: float = 2.0,
+    ) -> int:
+        """Estimate the sufficient hash length, given the current settings.
+
+        Using the entropy (via `_entropy`) and a safety factor, give an
+        initial estimate of the length to use for `create_hash` such
+        that using a `Sequin` with this hash will not exhaust it during
+        passphrase generation.
+
+        Args:
+            safety_factor: The safety factor.  Must be at least 1.
+
+        Returns:
+            The estimated sufficient hash length.
+
+        Warning:
+            This is a heuristic, not an exact computation; it may
+            underestimate the true necessary hash length.  It is
+            intended as a starting point for searching for a sufficient
+            hash length, usually by doubling the hash length each time
+            it does not yet prove so.
+
+        """
+        try:
+            safety_factor = float(safety_factor)
+        except TypeError as e:
+            raise TypeError(f'invalid safety factor: not a float: '
+                            f'{safety_factor!r}') from e
+        if not math.isfinite(safety_factor) or safety_factor < 1.0:
+            raise ValueError(f'invalid safety factor {safety_factor!r}')
+        # Ensure the bound is strictly positive.
+        entropy_bound = max(1, self._entropy())
+        return int(math.ceil(safety_factor * entropy_bound / 8))
 
     @classmethod
     def create_hash(
@@ -225,12 +276,8 @@ class Vault:
             b': 4TVH#5:aZl8LueOT\\{'
 
         """
-        entropy_bound = self._entropy_upper_bound()
-        # Use a safety factor, because a sequin will potentially throw
-        # bits away and we cannot rely on having generated a hash of
-        # exactly the right length.
-        safety_factor = 2
-        hash_length = int(math.ceil(safety_factor * entropy_bound / 8))
+        hash_length = self._estimate_sufficient_hash_length()
+        assert hash_length >= 1
         # Ensure the phrase is a bytes object.  Needed later for safe
         # concatenation.
         if isinstance(service_name, str):
@@ -267,11 +314,36 @@ class Vault:
                                                      charset)
                     pos = seq.generate(len(charset))
                     result.extend(charset[pos:pos+1])
-            except sequin.SequinExhaustedException:  # pragma: no cover
+            except sequin.SequinExhaustedException:
                 hash_length *= 2
             else:
                 return bytes(result)
 
+    @staticmethod
+    def _is_suitable_ssh_key(key: bytes | bytearray, /) -> bool:
+        """Check whether the key is suitable for passphrase derivation.
+
+        Currently, this only checks whether signatures with this key
+        type are deterministic.
+
+        Args:
+            key: SSH public key to check.
+
+        Returns:
+            True if and only if the key is suitable for use in deriving
+            a passphrase deterministically.
+
+        """
+        deterministic_signature_types = {
+            'ssh-ed25519':
+                lambda k: k.startswith(b'\x00\x00\x00\x0bssh-ed25519'),
+            'ssh-ed448':
+                lambda k: k.startswith(b'\x00\x00\x00\x09ssh-ed448'),
+            'ssh-rsa':
+                lambda k: k.startswith(b'\x00\x00\x00\x07ssh-rsa'),
+        }
+        return any(v(key) for v in deterministic_signature_types.values())
+
     @classmethod
     def phrase_from_signature(
         cls, key: bytes | bytearray, /
@@ -314,15 +386,7 @@ class Vault:
             True
 
         """
-        deterministic_signature_types = {
-            'ssh-ed25519':
-                lambda k: k.startswith(b'\x00\x00\x00\x0bssh-ed25519'),
-            'ssh-ed448':
-                lambda k: k.startswith(b'\x00\x00\x00\x09ssh-ed448'),
-            'ssh-rsa':
-                lambda k: k.startswith(b'\x00\x00\x00\x07ssh-rsa'),
-        }
-        if not any(v(key) for v in deterministic_signature_types.values()):
+        if not cls._is_suitable_ssh_key(key):
             raise ValueError(
                 'unsuitable SSH key: bad key, or signature not deterministic')
         with ssh_agent_client.SSHAgentClient() as client:

tests/test_passphrase_generating.py

Zeige Datei @ 81f902f

@@ -4,17 +4,22 @@
 
 """Test passphrase generation via derivepassphrase.Vault."""
 
-import pytest
+from __future__ import annotations
+
+import math
 
 import derivepassphrase
 import sequin
+import pytest
 
 Vault = derivepassphrase.Vault
 phrase = b'She cells C shells bye the sea shoars'
+google_phrase = rb': 4TVH#5:aZl8LueOT\{'
+twitter_phrase = rb"[ (HN_N:lI&<ro=)3'g9"
 
-@pytest.mark.parametrize('service,expected', [
-    (b'google', rb': 4TVH#5:aZl8LueOT\{'),
-    ('twitter', rb"[ (HN_N:lI&<ro=)3'g9"),
+@pytest.mark.parametrize(['service', 'expected'], [
+    (b'google', google_phrase),
+    ('twitter', twitter_phrase),
 ])
 def test_200_basic_configuration(service, expected):
     assert Vault(phrase=phrase).generate(service) == expected
@@ -122,3 +127,71 @@ def test_301_character_set_subtraction_duplicate():
         Vault._subtract(b'abcdef', b'aabbccddeeff')
     with pytest.raises(ValueError, match='duplicate characters'):
         Vault._subtract(b'aabbccddeeff', b'abcdef')
+
+@pytest.mark.parametrize(['length', 'settings', 'entropy'], [
+    (20, {}, math.log2(math.factorial(20)) + 20 * math.log2(94)),
+    (
+        20,
+        {'upper': 0, 'number': 0, 'space': 0, 'symbol': 0},
+        math.log2(math.factorial(20)) + 20 * math.log2(26)
+    ),
+    (0, {}, float('-inf')),
+    (0, {'lower': 0, 'number': 0, 'space': 0, 'symbol': 0}, float('-inf')),
+    (1, {}, math.log2(94)),
+    (1, {'upper': 0, 'lower': 0, 'number': 0, 'symbol': 0}, 0.0),
+])
+def test_400_entropy(
+    length: int, settings: dict[str, int], entropy: int
+) -> None:
+    v = Vault(length=length, **settings)
+    assert math.isclose(v._entropy(), entropy)
+    assert v._estimate_sufficient_hash_length() > 0
+    if math.isfinite(entropy) and entropy:
+        assert v._estimate_sufficient_hash_length(1.0) == math.ceil(entropy / 8)
+    assert v._estimate_sufficient_hash_length(8.0) >= entropy
+
+def test_401_hash_length_estimation(
+) -> None:
+    v = Vault(phrase=phrase)
+    with pytest.raises(ValueError,
+                       match='invalid safety factor'):
+        assert v._estimate_sufficient_hash_length(-1.0)
+    with pytest.raises(TypeError,
+                       match='invalid safety factor: not a float'):
+        assert v._estimate_sufficient_hash_length(None)  # type: ignore
+    v2 = Vault(phrase=phrase, lower=0, upper=0, number=0, symbol=0,
+               space=1, length=1)
+    assert v2._entropy() == 0.0
+    assert v2._estimate_sufficient_hash_length() > 0
+
+@pytest.mark.parametrize(['service', 'expected'], [
+    (b'google', google_phrase),
+    ('twitter', twitter_phrase),
+])
+def test_402_hash_length_expansion(
+    monkeypatch: Any, service: str | bytes, expected: bytes
+) -> None:
+    v = Vault(phrase=phrase)
+    monkeypatch.setattr(v,
+                        '_estimate_sufficient_hash_length',
+                        lambda *args, **kwargs: 1)
+    assert v._estimate_sufficient_hash_length
+    assert v.generate(service) == expected
+
+@pytest.mark.parametrize(['s', 'raises'], [
+    ('ñ', True), ('Düsseldorf', True),
+    ('liberté, egalité, fraternité', True), ('ASCII', False),
+    ('Düsseldorf'.encode('UTF-8'), False),
+    (bytearray([2, 3, 5, 7, 11, 13]), False),
+])
+def test_403_binary_strings(s: str | bytes | bytearray, raises: bool) -> None:
+    binstr = derivepassphrase.Vault._get_binary_string
+    if raises:
+        with pytest.raises(derivepassphrase.AmbiguousByteRepresentationError):
+            binstr(s)
+    elif isinstance(s, str):
+        assert binstr(s) == s.encode('UTF-8')
+        assert binstr(binstr(s)) == s.encode('UTF-8')
+    else:
+        assert binstr(s) == bytes(s)
+        assert binstr(binstr(s)) == bytes(s)


...	...	@@ -132,19 +132,70 @@ class Vault:
132	132	for _ in range(len(self._required), self._length):
133	133	self._required.append(bytes(self._allowed))
134	134
135		- def _entropy_upper_bound(self) -> int:
	135	+ def _entropy(self) -> float:
136	136	"""Estimate the passphrase entropy, given the current settings.
137	137
138	138	The entropy is the base 2 logarithm of the amount of
139		- possibilities. We operate directly on the logarithms, and round
140		- each summand up, overestimating the true entropy.
	139	+ possibilities. We operate directly on the logarithms, and use
	140	+ sorting and [`math.fsum`][] to keep high accuracy.
	141	+
	142	+ Note:
	143	+ We actually overestimate the entropy here because of poor
	144	+ handling of character repetitions. In the extreme, assuming
	145	+ that only one character were allowed, then because there is
	146	+ only one possible string of each given length, the entropy
	147	+ of that string `s` is always be zero. However, we calculate
	148	+ the entropy as `math.log2(math.factorial(len(s)))`, i.e. we
	149	+ assume the characters at the respective string position are
	150	+ distinguishable from each other.
	151	+
	152	+ Returns:
	153	+ A valid (and somewhat close) upper bound to the entropy.
141	154
142	155	"""
143	156	factors: list[int] = []
	157	+ if not self._required or any(not x for x in self._required):
	158	+ return float('-inf')
144	159	for i, charset in enumerate(self._required):
145	160	factors.append(i + 1)
146	161	factors.append(len(charset))
147		- return sum(int(math.ceil(math.log2(f))) for f in factors)
	162	+ factors.sort()
	163	+ return math.fsum(math.log2(f) for f in factors)
	164	+
	165	+ def _estimate_sufficient_hash_length(
	166	+ self, safety_factor: float = 2.0,
	167	+ ) -> int:
	168	+ """Estimate the sufficient hash length, given the current settings.
	169	+
	170	+ Using the entropy (via `_entropy`) and a safety factor, give an
	171	+ initial estimate of the length to use for `create_hash` such
	172	+ that using a `Sequin` with this hash will not exhaust it during
	173	+ passphrase generation.
	174	+
	175	+ Args:
	176	+ safety_factor: The safety factor. Must be at least 1.
	177	+
	178	+ Returns:
	179	+ The estimated sufficient hash length.
	180	+
	181	+ Warning:
	182	+ This is a heuristic, not an exact computation; it may
	183	+ underestimate the true necessary hash length. It is
	184	+ intended as a starting point for searching for a sufficient
	185	+ hash length, usually by doubling the hash length each time
	186	+ it does not yet prove so.
	187	+
	188	+ """
	189	+ try:
	190	+ safety_factor = float(safety_factor)
	191	+ except TypeError as e:
	192	+ raise TypeError(f'invalid safety factor: not a float: '
	193	+ f'{safety_factor!r}') from e
	194	+ if not math.isfinite(safety_factor) or safety_factor < 1.0:
	195	+ raise ValueError(f'invalid safety factor {safety_factor!r}')
	196	+ # Ensure the bound is strictly positive.
	197	+ entropy_bound = max(1, self._entropy())
	198	+ return int(math.ceil(safety_factor * entropy_bound / 8))
148	199
149	200	@classmethod
150	201	def create_hash(
...	...	@@ -225,12 +276,8 @@ class Vault:
225	276	b': 4TVH#5:aZl8LueOT\\{'
226	277
227	278	"""
228		- entropy_bound = self._entropy_upper_bound()
229		- # Use a safety factor, because a sequin will potentially throw
230		- # bits away and we cannot rely on having generated a hash of
231		- # exactly the right length.
232		- safety_factor = 2
233		- hash_length = int(math.ceil(safety_factor * entropy_bound / 8))
	279	+ hash_length = self._estimate_sufficient_hash_length()
	280	+ assert hash_length >= 1
234	281	# Ensure the phrase is a bytes object. Needed later for safe
235	282	# concatenation.
236	283	if isinstance(service_name, str):
...	...	@@ -267,11 +314,36 @@ class Vault:
267	314	charset)
268	315	pos = seq.generate(len(charset))
269	316	result.extend(charset[pos:pos+1])
270		- except sequin.SequinExhaustedException: # pragma: no cover
	317	+ except sequin.SequinExhaustedException:
271	318	hash_length *= 2
272	319	else:
273	320	return bytes(result)
274	321
	322	+ @staticmethod
	323	+ def _is_suitable_ssh_key(key: bytes \| bytearray, /) -> bool:
	324	+ """Check whether the key is suitable for passphrase derivation.
	325	+
	326	+ Currently, this only checks whether signatures with this key
	327	+ type are deterministic.
	328	+
	329	+ Args:
	330	+ key: SSH public key to check.
	331	+
	332	+ Returns:
	333	+ True if and only if the key is suitable for use in deriving
	334	+ a passphrase deterministically.
	335	+
	336	+ """
	337	+ deterministic_signature_types = {
	338	+ 'ssh-ed25519':
	339	+ lambda k: k.startswith(b'\x00\x00\x00\x0bssh-ed25519'),
	340	+ 'ssh-ed448':
	341	+ lambda k: k.startswith(b'\x00\x00\x00\x09ssh-ed448'),
	342	+ 'ssh-rsa':
	343	+ lambda k: k.startswith(b'\x00\x00\x00\x07ssh-rsa'),
	344	+ }
	345	+ return any(v(key) for v in deterministic_signature_types.values())
	346	+
275	347	@classmethod
276	348	def phrase_from_signature(
277	349	cls, key: bytes \| bytearray, /
...	...	@@ -314,15 +386,7 @@ class Vault:
314	386	True
315	387
316	388	"""
317		- deterministic_signature_types = {
318		- 'ssh-ed25519':
319		- lambda k: k.startswith(b'\x00\x00\x00\x0bssh-ed25519'),
320		- 'ssh-ed448':
321		- lambda k: k.startswith(b'\x00\x00\x00\x09ssh-ed448'),
322		- 'ssh-rsa':
323		- lambda k: k.startswith(b'\x00\x00\x00\x07ssh-rsa'),
324		- }
325		- if not any(v(key) for v in deterministic_signature_types.values()):
	389	+ if not cls._is_suitable_ssh_key(key):
326	390	raise ValueError(
327	391	'unsuitable SSH key: bad key, or signature not deterministic')
328	392	with ssh_agent_client.SSHAgentClient() as client:

...	...	@@ -4,17 +4,22 @@
4	4
5	5	"""Test passphrase generation via derivepassphrase.Vault."""
6	6
7		-import pytest
	7	+from __future__ import annotations
	8	+
	9	+import math
8	10
9	11	import derivepassphrase
10	12	import sequin
	13	+import pytest
11	14
12	15	Vault = derivepassphrase.Vault
13	16	phrase = b'She cells C shells bye the sea shoars'
	17	+google_phrase = rb': 4TVH#5:aZl8LueOT\{'
	18	+twitter_phrase = rb"[ (HN_N:lI&<ro=)3'g9"
14	19
15		-@pytest.mark.parametrize('service,expected', [
16		- (b'google', rb': 4TVH#5:aZl8LueOT\{'),
17		- ('twitter', rb"[ (HN_N:lI&<ro=)3'g9"),
	20	+@pytest.mark.parametrize(['service', 'expected'], [
	21	+ (b'google', google_phrase),
	22	+ ('twitter', twitter_phrase),
18	23	])
19	24	def test_200_basic_configuration(service, expected):
20	25	assert Vault(phrase=phrase).generate(service) == expected
...	...	@@ -122,3 +127,71 @@ def test_301_character_set_subtraction_duplicate():
122	127	Vault._subtract(b'abcdef', b'aabbccddeeff')
123	128	with pytest.raises(ValueError, match='duplicate characters'):
124	129	Vault._subtract(b'aabbccddeeff', b'abcdef')
	130	+
	131	+@pytest.mark.parametrize(['length', 'settings', 'entropy'], [
	132	+ (20, {}, math.log2(math.factorial(20)) + 20 * math.log2(94)),
	133	+ (
	134	+ 20,
	135	+ {'upper': 0, 'number': 0, 'space': 0, 'symbol': 0},
	136	+ math.log2(math.factorial(20)) + 20 * math.log2(26)
	137	+ ),
	138	+ (0, {}, float('-inf')),
	139	+ (0, {'lower': 0, 'number': 0, 'space': 0, 'symbol': 0}, float('-inf')),
	140	+ (1, {}, math.log2(94)),
	141	+ (1, {'upper': 0, 'lower': 0, 'number': 0, 'symbol': 0}, 0.0),
	142	+])
	143	+def test_400_entropy(
	144	+ length: int, settings: dict[str, int], entropy: int
	145	+) -> None:
	146	+ v = Vault(length=length, **settings)
	147	+ assert math.isclose(v._entropy(), entropy)
	148	+ assert v._estimate_sufficient_hash_length() > 0
	149	+ if math.isfinite(entropy) and entropy:
	150	+ assert v._estimate_sufficient_hash_length(1.0) == math.ceil(entropy / 8)
	151	+ assert v._estimate_sufficient_hash_length(8.0) >= entropy
	152	+
	153	+def test_401_hash_length_estimation(
	154	+) -> None:
	155	+ v = Vault(phrase=phrase)
	156	+ with pytest.raises(ValueError,
	157	+ match='invalid safety factor'):
	158	+ assert v._estimate_sufficient_hash_length(-1.0)
	159	+ with pytest.raises(TypeError,
	160	+ match='invalid safety factor: not a float'):
	161	+ assert v._estimate_sufficient_hash_length(None) # type: ignore
	162	+ v2 = Vault(phrase=phrase, lower=0, upper=0, number=0, symbol=0,
	163	+ space=1, length=1)
	164	+ assert v2._entropy() == 0.0
	165	+ assert v2._estimate_sufficient_hash_length() > 0
	166	+
	167	+@pytest.mark.parametrize(['service', 'expected'], [
	168	+ (b'google', google_phrase),
	169	+ ('twitter', twitter_phrase),
	170	+])
	171	+def test_402_hash_length_expansion(
	172	+ monkeypatch: Any, service: str \| bytes, expected: bytes
	173	+) -> None:
	174	+ v = Vault(phrase=phrase)
	175	+ monkeypatch.setattr(v,
	176	+ '_estimate_sufficient_hash_length',
	177	+ lambda args, *kwargs: 1)
	178	+ assert v._estimate_sufficient_hash_length
	179	+ assert v.generate(service) == expected
	180	+
	181	+@pytest.mark.parametrize(['s', 'raises'], [
	182	+ ('ñ', True), ('Düsseldorf', True),
	183	+ ('liberté, egalité, fraternité', True), ('ASCII', False),
	184	+ ('Düsseldorf'.encode('UTF-8'), False),
	185	+ (bytearray([2, 3, 5, 7, 11, 13]), False),
	186	+])
	187	+def test_403_binary_strings(s: str \| bytes \| bytearray, raises: bool) -> None:
	188	+ binstr = derivepassphrase.Vault._get_binary_string
	189	+ if raises:
	190	+ with pytest.raises(derivepassphrase.AmbiguousByteRepresentationError):
	191	+ binstr(s)
	192	+ elif isinstance(s, str):
	193	+ assert binstr(s) == s.encode('UTF-8')
	194	+ assert binstr(binstr(s)) == s.encode('UTF-8')
	195	+ else:
	196	+ assert binstr(s) == bytes(s)
	197	+ assert binstr(binstr(s)) == bytes(s)
125	198