feat: Yield inside huge values migration serialization (#4197)

* feat: Yield inside huge values migration serialization

With #4144 we split huge-value slot migration into multiple commands.
This PR adds a yield between those commands.
It also adds a test that checks that modifying huge values during a
migration works correctly and that RSS doesn't grow too much.

Fixes #4100
Shahar Mike 2025-01-05 16:28:45 +02:00 committed by GitHub
parent ff4add0c9e
commit 7860a169d9
7 changed files with 148 additions and 16 deletions
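
Conceptually, the change can be illustrated with the Python sketch below; it is an illustration only, not Dragonfly's actual code (the real implementation is C++ in the migration serialization path, and migrate_huge_container / sink are hypothetical names). A huge container is serialized as a sequence of bounded chunks, with a yield between the generated commands so other fibers on the same thread can make progress while the value is streamed out. The chunk bound plays the role of the serialization_max_chunk_size flag exercised by the new test, and the new tests assert that used_memory_peak_rss stays within roughly 10% of the pre-migration RSS.

# Illustrative sketch only (the real code is C++): serialize a huge container
# in bounded chunks and yield between commands instead of one monolithic blob.
import asyncio

async def migrate_huge_container(entries, sink, max_chunk_size=30):
    chunk = []
    for entry in entries:
        chunk.append(entry)
        if len(chunk) >= max_chunk_size:
            await sink.send(chunk)   # one partial-restore command per chunk
            chunk = []
            await asyncio.sleep(0)   # the yield point this PR adds between commands
    if chunk:
        await sink.send(chunk)       # flush the last partial chunk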

@@ -14,8 +14,7 @@ from .replication_test import check_all_replicas_finished
from redis.cluster import RedisCluster
from redis.cluster import ClusterNode
from .proxy import Proxy
from .seeder import SeederBase
from .seeder import StaticSeeder
from .seeder import Seeder, SeederBase, StaticSeeder
from . import dfly_args
@@ -33,6 +32,11 @@ def monotonically_increasing_port_number():
next_port = monotonically_increasing_port_number()
async def get_memory(client, field):
info = await client.info("memory")
return info[field]
class RedisClusterNode:
def __init__(self, port):
self.port = port
@@ -1981,6 +1985,7 @@ async def test_cluster_migration_cancel(df_factory: DflyInstanceFactory):
@dfly_args({"proactor_threads": 2, "cluster_mode": "yes"})
@pytest.mark.asyncio
@pytest.mark.opt_only
async def test_cluster_migration_huge_container(df_factory: DflyInstanceFactory):
instances = [
df_factory.create(port=next(next_port), admin_port=next(next_port)) for i in range(2)
@@ -1995,7 +2000,7 @@ async def test_cluster_migration_huge_container(df_factory: DflyInstanceFactory)
logging.debug("Generating huge containers")
seeder = StaticSeeder(
key_target=10,
key_target=100,
data_size=10_000_000,
collection_size=10_000,
variance=1,
@@ -2005,6 +2010,8 @@ async def test_cluster_migration_huge_container(df_factory: DflyInstanceFactory)
await seeder.run(nodes[0].client)
source_data = await StaticSeeder.capture(nodes[0].client)
mem_before = await get_memory(nodes[0].client, "used_memory_rss")
nodes[0].migrations = [
MigrationInfo("127.0.0.1", instances[1].admin_port, [(0, 16383)], nodes[1].id)
]
@@ -2017,6 +2024,74 @@ async def test_cluster_migration_huge_container(df_factory: DflyInstanceFactory)
target_data = await StaticSeeder.capture(nodes[1].client)
assert source_data == target_data
# Get peak memory, because migration removes the data
mem_after = await get_memory(nodes[0].client, "used_memory_peak_rss")
logging.debug(f"Memory before {mem_before} after {mem_after}")
assert mem_after < mem_before * 1.1
@dfly_args({"proactor_threads": 2, "cluster_mode": "yes"})
@pytest.mark.parametrize("chunk_size", [1_000_000, 30])
@pytest.mark.asyncio
async def test_cluster_migration_while_seeding(
df_factory: DflyInstanceFactory, df_seeder_factory: DflySeederFactory, chunk_size
):
instances = [
df_factory.create(
port=next(next_port),
admin_port=next(next_port),
serialization_max_chunk_size=chunk_size,
)
for _ in range(2)
]
df_factory.start_all(instances)
nodes = [await create_node_info(instance) for instance in instances]
nodes[0].slots = [(0, 16383)]
nodes[1].slots = []
client0 = nodes[0].client
client1 = nodes[1].client
await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])
logging.debug("Seeding cluster")
seeder = df_seeder_factory.create(
keys=10_000, port=instances[0].port, cluster_mode=True, mirror_to_fake_redis=True
)
await seeder.run(target_deviation=0.1)
seed = asyncio.create_task(seeder.run())
await asyncio.sleep(1)
nodes[0].migrations = [
MigrationInfo("127.0.0.1", instances[1].admin_port, [(0, 16383)], nodes[1].id)
]
logging.debug("Migrating slots")
await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])
logging.debug("Waiting for migration to finish")
await wait_for_status(nodes[0].admin_client, nodes[1].id, "FINISHED", timeout=300)
logging.debug("Migration finished")
logging.debug("Finalizing migration")
nodes[0].slots = []
nodes[1].slots = [(0, 16383)]
await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes])
await asyncio.sleep(1) # Let seeder feed dest before migration finishes
seeder.stop()
await seed
logging.debug("Seeding finished")
assert (
await get_memory(client0, "used_memory_peak_rss")
< await get_memory(client0, "used_memory_rss") * 1.1
)
capture = await seeder.capture_fake_redis()
assert await seeder.compare(capture, instances[1].port)
def parse_lag(replication_info: str):
lags = re.findall("lag=([0-9]+)\r\n", replication_info)

@@ -25,3 +25,4 @@ pytest-emoji==0.2.0
pytest-icdiff==0.8
pytest-timeout==2.2.0
asyncio==3.4.3
fakeredis[json]==2.26.2
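
The new fakeredis[json] dependency provides an in-process, asyncio-compatible Redis stand-in that the seeder mirrors commands into (see mirror_to_fake_redis below). A minimal usage sketch, assuming fakeredis 2.x as pinned above:

# Minimal sketch of the fakeredis API the seeder relies on: an in-memory
# async client that returns the same reply types as a real server.
import asyncio
import fakeredis

async def main():
    r = fakeredis.FakeAsyncRedis()
    await r.set("k", "v")
    assert await r.get("k") == b"v"   # bytes, like redis-py without decode_responses

asyncio.run(main())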

@@ -177,14 +177,16 @@ class Seeder(SeederBase):
]
sha = await client.script_load(Seeder._load_script("generate"))
await asyncio.gather(
*(self._run_unit(client, sha, unit, using_stopkey, args) for unit in self.units)
)
for unit in self.units:
# Must be serial, otherwise the cluster client throws an exception
await self._run_unit(client, sha, unit, using_stopkey, args)
async def stop(self, client: aioredis.Redis):
"""Request seeder seeder if it's running without a target, future returned from start() must still be awaited"""
await asyncio.gather(*(client.set(unit.stop_key, "X") for unit in self.units))
for unit in self.units:
# Must be serial, otherwise the cluster client throws an exception
await client.set(unit.stop_key, "X")
def change_key_target(self, target: int):
"""Change key target, applied only on succeeding runs"""

@@ -4,6 +4,8 @@ import string
from redis import asyncio as aioredis
from . import dfly_args
from .seeder import Seeder, StaticSeeder
from .instance import DflyInstanceFactory, DflyInstance
from .utility import *
@dfly_args({"proactor_threads": 4})
@@ -114,3 +116,22 @@ async def test_seeder_capture(async_client: aioredis.Redis):
# Do another change
await async_client.spop("set1")
assert capture != await Seeder.capture(async_client)
@pytest.mark.asyncio
@dfly_args({"proactor_threads": 2})
async def test_seeder_fake_redis(
df_factory: DflyInstanceFactory, df_seeder_factory: DflySeederFactory
):
instance = df_factory.create()
df_factory.start_all([instance])
seeder = df_seeder_factory.create(
keys=100, port=instance.port, unsupported_types=[ValueType.JSON], mirror_to_fake_redis=True
)
await seeder.run(target_ops=5_000)
capture = await seeder.capture_fake_redis()
assert await seeder.compare(capture, instance.port)

@@ -14,6 +14,7 @@ import json
import subprocess
import pytest
import os
import fakeredis
from typing import Iterable, Union
from enum import Enum
@@ -271,7 +272,7 @@ class CommandGenerator:
("LPUSH {k} {val}", ValueType.LIST),
("LPOP {k}", ValueType.LIST),
("SADD {k} {val}", ValueType.SET),
("SPOP {k}", ValueType.SET),
# ("SPOP {k}", ValueType.SET), # Disabled because it is inconsistent
("HSETNX {k} v0 {val}", ValueType.HSET),
("HINCRBY {k} v1 1", ValueType.HSET),
("ZPOPMIN {k} 1", ValueType.ZSET),
@@ -423,6 +424,7 @@ class DflySeeder:
unsupported_types=[],
stop_on_failure=True,
cluster_mode=False,
mirror_to_fake_redis=False,
):
if cluster_mode:
max_multikey = 1
@@ -436,11 +438,16 @@ class DflySeeder:
self.multi_transaction_probability = multi_transaction_probability
self.stop_flag = False
self.stop_on_failure = stop_on_failure
self.fake_redis = None
self.log_file = log_file
if self.log_file is not None:
open(self.log_file, "w").close()
if mirror_to_fake_redis:
logging.debug("Creating FakeRedis instance")
self.fake_redis = fakeredis.FakeAsyncRedis()
async def run(self, target_ops=None, target_deviation=None):
"""
Run a seeding cycle on all dbs either until stop(), a fixed number of commands (target_ops)
@@ -474,6 +481,14 @@ class DflySeeder:
"""Reset internal state. Needs to be called after flush or restart"""
self.gen.reset()
async def capture_fake_redis(self):
keys = sorted(list(self.gen.keys_and_types()))
# TODO: support multiple databases
assert self.dbcount == 1
assert self.fake_redis != None
capture = DataCapture(await self._capture_entries(self.fake_redis, keys))
return [capture]
async def capture(self, port=None):
"""Create DataCapture for all dbs"""
@@ -588,12 +603,19 @@ class DflySeeder:
queue.task_done()
break
pipe = client.pipeline(transaction=tx_data[1])
for cmd in tx_data[0]:
pipe.execute_command(*cmd)
try:
await pipe.execute()
if self.fake_redis is None:
pipe = client.pipeline(transaction=tx_data[1])
for cmd in tx_data[0]:
pipe.execute_command(*cmd)
await pipe.execute()
else:
# To mirror consistently to Fake Redis, we must send it only commands that succeeded
# on Dragonfly. We can't use pipelines because they might succeed only partially.
for cmd in tx_data[0]:
dfly_resp = await client.execute_command(*cmd)
fake_resp = await self.fake_redis.execute_command(*cmd)
assert dfly_resp == fake_resp
except (redis.exceptions.ConnectionError, redis.exceptions.ResponseError) as e:
if self.stop_on_failure:
await self._close_client(client)