Hello!
I’m a new user. While trying pijul to replace git for my personal projects, I was quite surprised that I hit some pretty strong performance issues.
My use cases were:
- a repo with a single 4MB file
- a repo with a few hundred small files
In the first case committing felt laggy, but not too awful. But when I tried to diff a change (of just ~270 lines) it used up all of my 8GB of RAM and crashed my system. With git it’s all instantaneous.
In the second case I noticed that while the repo itself was less than 20MB, the .pijul folder ran up to 200MB (whereas with git it’s a bit more than half the repo). Moreover committing and diffing was awfully slow and CPU intensive.
I prepared a simple python test to demonstrate the second issue:
#!/usr/bin/python3
"""This test shows the strong performance hit of large repos with pijul, compared to git."""
import glob
import json
import os
import random
import shutil
import shlex
import string
import subprocess
import time
def get_size(start_path):
    """Return the total size of all files under *start_path*, formatted as 'X MB'."""
    size_bytes = 0
    for root, _dirs, files in os.walk(start_path):
        size_bytes += sum(os.path.getsize(os.path.join(root, name)) for name in files)
    # Decimal megabytes (1 MB = 1,000,000 bytes), matching the original output.
    return str(size_bytes / 1000000) + ' MB'
def random_string(length=25):
    """Return a random alphanumeric string of the given length."""
    alphabet = string.ascii_letters + string.digits
    parts = []
    # One random.choice per character, so the RNG stream matches a seeded run.
    for _ in range(length):
        parts.append(random.choice(alphabet))
    return ''.join(parts)
# Remember the starting directory so the script can chdir back after each repo.
startdir = os.getcwd()
# Scratch directories for the two repositories being compared.
pjl = 'pijul-test-performance-pjl/'
git = 'pijul-test-performance-git/'
os.makedirs(pjl)  # the git copy is created later via shutil.copytree
def build_value():
    """Return a list of 20 random strings (one JSON value)."""
    # Comprehension instead of a manual append loop (same output, idiomatic).
    return [random_string() for _ in range(20)]
def build_obj():
    """Return a dict of three random keys, each mapped to a fresh random value list."""
    obj = {}
    # Same evaluation order as a dict literal: key first, then value, three times.
    for _ in range(3):
        obj[random_string()] = build_value()
    return obj
def build_data():
    """Build a list of 20 random objects.

    Random strings simulate real-life data and avoid compression
    optimizations on identical content.
    """
    # Comprehension instead of a manual append loop (same output, idiomatic).
    return [build_obj() for _ in range(20)]
print('Wait a few seconds, generating data...')
for n in range(400):
    with open(pjl + str(n) + '.json', 'w') as out:
        json.dump(build_data(), out, indent=2)
shutil.copytree(pjl, git)
print('Total size of files: ' + get_size(pjl))

def _run(cmd):
    """Run *cmd* (a shell-style command string), discarding its stdout."""
    subprocess.run(shlex.split(cmd), stdout=subprocess.DEVNULL)

def _append_random_line():
    """Append one random string to 1.json, producing a small change to commit."""
    with open('1.json') as src:
        payload = json.load(src)
    payload.append(random_string())
    with open('1.json', 'w') as dst:
        json.dump(payload, dst, indent=2)

# --- pijul ---
os.chdir(pjl)
start = time.time()
_run('pijul init')
_run('pijul add ' + ' '.join(glob.glob('*')))
_run('pijul record -a -m first')
stop = time.time()
print('First PIJUL commit took', str((stop - start)), 'seconds')
print('Repo size:', get_size('.pijul'))
# Subsequent operations suffer a huge performance hit even for tiny commits.
# NOTE: diffing a 4MB file with ~270 changed lines hung my system and forced a
# hard kill of the user session, whereas git has no problem at all.
_append_random_line()
start = time.time()
_run('pijul record -a -m second')
stop = time.time()
print('Second PIJUL commit took', str((stop - start)), 'seconds')
os.chdir(startdir)

# --- git ---
os.chdir(git)
start = time.time()
_run('git init')
_run('git add ' + ' '.join(glob.glob('*')))
_run('git commit -m first')
stop = time.time()
print('First GIT commit took', str((stop - start)), 'seconds')
print('Repo size:', get_size('.git'))
_append_random_line()
start = time.time()
_run('git commit -am second')
stop = time.time()
print('Second GIT commit took', str((stop - start)), 'seconds')
os.chdir(startdir)
I wonder whether this is just an implementation bug that will likely be solved when pijul hits 1.0, or whether it comes from a design choice that is difficult to fix, as I hear darcs has awful performance too.