I want to create a large file (>200 GB) and store it in MinIO. My attempts run in a web app deployed on a Kubernetes pod.
One attempt used a custom Readable stream and the csv-writer library with the putObject method. Something like this:
const { faker } = require('@faker-js/faker');
const { createObjectCsvStringifier: createCsvStringifier } = require('csv-writer');
const Minio = require('minio');
const { Readable } = require('stream');
const minioClient = new Minio.Client({...});
const csvStringifier = createCsvStringifier({
  header: [
    { id: 'userId', title: 'userId' },
    { id: 'username', title: 'username' },
    .... ]});
const generateRandomRow = () => ({
  userId: faker.database.mongodbObjectId(),
  username: faker.person.firstName(),
  ...});
class csvGenerator extends Readable {
  #count = 0;
  #headerPushed = false;
  #numRows;

  constructor(numRows, options) {
    super(options);
    this.#numRows = numRows;
  }

  _read(size) {
    if (!this.#headerPushed) {
      this.push(csvStringifier.getHeaderString());
      this.#headerPushed = true;
    }
    this.push(csvStringifier.stringifyRecords([generateRandomRow()]));
    if (++this.#count === this.#numRows) {
      this.push(null);
    }
  }
}
router.options('/BigFileCreation', cors());
router.post('/BigFileCreation', cors(), async (request, response) => {
  const NUM_ROWS = parseInt(request.body.numberOfRows, 10);
  const NAME_FILE = request.body.nameOfFile;
  const BUCKET = request.body.bucket;
  response.status(202).json({ "Request status": "Reached" });
  try {
    const requestFile = await minioClient.putObject(BUCKET, NAME_FILE, new csvGenerator(NUM_ROWS, { highWaterMark: 1 }), null, metaData);
    console.log(requestFile);
  } catch (error) {
    console.error(error);
    // the 202 response has already been sent, so only respond if that somehow failed
    if (!response.headersSent) {
      response.status(500).json(error.toString());
    }
  }
});
This handles files under 1 GB with no issue (it takes less than 5 minutes to create and upload), but when I request a 2 GB file or more my pod just stops. I assume I'm getting an OOMKilled status on the pod, which is why I don't see any error message in the logs.
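A minimal sketch of a memory probe that could run alongside the upload, to see which part of the process memory grows before the pod dies (it only assumes Node's built-in process.memoryUsage()):
// Rough memory probe: log RSS and heap usage every 10 seconds while the upload runs.
const memoryProbe = setInterval(() => {
  const { rss, heapUsed, external } = process.memoryUsage();
  console.log(`rss=${Math.round(rss / 1e6)}MB heapUsed=${Math.round(heapUsed / 1e6)}MB external=${Math.round(external / 1e6)}MB`);
}, 10000);
// call clearInterval(memoryProbe) once the upload promise settles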
I also tested it with a temporary file on disk, using the same csv-writer library, and then uploaded that file with the fPutObject method of the MinIO SDK:
const { createObjectCsvWriter: createCsvWriter } = require('csv-writer');
const csvWriter = createCsvWriter({
  path: 'StellarDB.csv',
  header: [
    { id: 'userId', title: 'userId' },
    { id: 'username', title: 'username' },
    { id: 'lastName', title: 'lastName' },
    { id: 'email', title: 'Email' },
    { id: 'column', title: 'column' },
    { id: 'float', title: 'float' },
    { id: 'jobArea', title: 'jobArea' },
    { id: 'jobTitle', title: 'jobTitle' },
    { id: 'phone', title: 'phone' },
    { id: 'alpha', title: 'alpha' }
  ]
});
const writeLargeCsvFile = async (NUM_ROWS) => {
  let batchSize = 500;
  let batch = [];
  for (let i = 0; i < NUM_ROWS; i++) {
    batch.push(generateRandomRow());
    if (batch.length === batchSize || i === NUM_ROWS - 1) {
      await csvWriter.writeRecords(batch);
      batch = [];
    }
  }
};
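The upload step for this attempt looked roughly like this (a sketch rather than the exact code; createAndUploadFromDisk is just an illustrative name, and the path matches the csvWriter config above):
// Write the CSV to a temporary file on disk, then let the MinIO SDK
// stream it from that path with fPutObject.
async function createAndUploadFromDisk(bucket, objectName, numRows) {
  await writeLargeCsvFile(numRows);
  const uploadInfo = await minioClient.fPutObject(bucket, objectName, 'StellarDB.csv', {});
  console.log(uploadInfo);
}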
After more research I suspected the issue was in the CSV library I was using, so I switched to fast-csv. My final attempt was something like this:
const { format } = require('fast-csv');
const { PassThrough } = require('stream');

async function generateAndUploadCSV(name, NUM_ROWS, bucketName) {
  const pass = new PassThrough();
  const uploadPromise = minioClient.putObject(bucketName, name, pass)
    .catch(err => {
      console.error('Error uploading object:', err);
      throw err;
    });

  const csvStream = format({ headers: [
    'userId', 'username', 'lastName', 'email', 'column', 'float', 'jobArea', 'jobTitle', 'phone', 'alpha'
  ]});
  csvStream.pipe(pass);
  csvStream.on('error', err => {
    pass.destroy(err);
  });

  let i = 0;
  function write() {
    let ok = true;
    while (i < NUM_ROWS && ok) {
      i++;
      const record = {
        userId: i,
        username: faker.person.firstName(),
        lastName: faker.person.lastName(),
        email: faker.internet.email(),
        column: faker.database.column(),
        float: faker.number.float(3),
        jobArea: faker.person.jobArea(),
        jobTitle: faker.person.jobTitle(),
        phone: faker.phone.imei(),
        alpha: faker.string.alpha({ length: { min: 5, max: 10 } }),
      };
      ok = csvStream.write(record); // false means the internal buffer is full
    }
    if (i < NUM_ROWS) {
      // resume only after backpressure clears
      csvStream.once('drain', () => setImmediate(write));
    } else {
      csvStream.end();
    }
  }

  write();
  const objInfo = await uploadPromise;
  return objInfo;
}
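For reference, this is a sketch of what I think the stream wiring should look like if Node's stream.pipeline handles the backpressure instead of my manual drain logic (rowGenerator is a hypothetical stand-in for the faker record above); is this the right direction?
const { pipeline } = require('stream/promises');
const { PassThrough, Readable } = require('stream');
const { format } = require('fast-csv');

// Hypothetical helper: lazily yields one fake row at a time.
async function* rowGenerator(numRows) {
  for (let i = 1; i <= numRows; i++) {
    yield { userId: i, username: faker.person.firstName() /* ...other columns... */ };
  }
}

async function generateAndUploadWithPipeline(bucketName, name, numRows) {
  const pass = new PassThrough();
  // Start the upload first so it consumes the PassThrough while rows are produced.
  const uploadPromise = minioClient.putObject(bucketName, name, pass);
  uploadPromise.catch(err => pass.destroy(err)); // abort the pipeline if the upload fails
  // pipeline() propagates backpressure from pass -> fast-csv -> the generator.
  await pipeline(
    Readable.from(rowGenerator(numRows)),
    format({ headers: true }),
    pass
  );
  return uploadPromise;
}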
I also assigned more resources to my pod (8 GB of memory and 4 cores), but all of my attempts behave the same: one file of about 1 GB and no more.
I also modified the entrypoint in my Dockerfile to CMD ["node", "--max-old-space-size=6144", "index.js"].
Researching further, I found there is the option of uploading the file in pieces and then merging them back together when I need to use it. That could be useful for CSV files, but what if I also want to use JSON files?
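Would streaming newline-delimited JSON (one object per line) the same way be a reasonable approach for the JSON case, instead of splitting and merging? A rough sketch of what I mean, reusing the hypothetical rowGenerator above:
const { pipeline } = require('stream/promises');
const { PassThrough, Readable, Transform } = require('stream');

async function generateAndUploadNdjson(bucketName, name, numRows) {
  // Serialize each row as one JSON object per line (NDJSON), so the object
  // streams like the CSV case and can later be read back line by line.
  const toNdjson = new Transform({
    objectMode: true,
    transform(row, _encoding, callback) {
      callback(null, JSON.stringify(row) + '\n');
    },
  });
  const pass = new PassThrough();
  const uploadPromise = minioClient.putObject(bucketName, name, pass);
  uploadPromise.catch(err => pass.destroy(err));
  await pipeline(Readable.from(rowGenerator(numRows)), toNdjson, pass);
  return uploadPromise;
}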
Creating and storing the file is just the first step in testing tools that should handle large files without memory problems; all of them run on Kubernetes pods.
Just to add more info: my service pod is managed by Knative, with a YAML file similar to this:
apiVersion: serving.knative.dev/v1
kind: Service
metadata:
  name: transformTesting
spec:
  template:
    spec:
      containers:
        - image: .../...:transform-testing-SNAPSHOT
          env:
            - name: FORCE_NEW_REVISION
              value: "true"
I wish someone could point me toward a solution or a concept that I'm overlooking.