I’m trying to develop a system to read the handwriting in a chart within a written page, using a multimodal LLM. I’m working in Google Apps Script. Here is what I’ve tried so far:
function openRouterApiRequest() {
  var apiKey = "****";
  var imageUrl = "https://www.dropbox.com/scl/fi/xxeyyiqs74rip0319080a/output-onlinepngtools.png?rlkey=******&e=2&st=w21t3k3j&raw=1";
  var apiEndpoint = 'https://openrouter.ai/api/v1/chat/completions';

  var payload = {
    "model": "meta-llama/llama-4-maverick",
    "messages": [
      {
        "role": "user",
        "content": [
          {
            "type": "text",
            // "text": "can you reproduce the chart in the middle of the image?"
            "text": " In the role of an OCR program can you convert the entire page from text and handwriting into text? Please note that in the chart in the middle of page, the 4th column has options of : S or M or C only. Also the only options for the 5th Column are 'up pointing arrow', 'down pointing arrow' or '='. The 6th column has only Y or N as options"
          },
          {
            "type": "image_url",
            "image_url": {
              "url": imageUrl
            }
          }
        ]
      }
    ]
  };

  var options = {
    'method': 'post',
    'headers': {
      'Authorization': 'Bearer ' + apiKey,
      'Content-Type': 'application/json'
    },
    'payload': JSON.stringify(payload),
    'muteHttpExceptions': true // To get the response even if the request fails
  };

  var response = UrlFetchApp.fetch(apiEndpoint, options);
  var responseCode = response.getResponseCode();
  var responseBody = response.getContentText();

  // Handle the response as needed
  Logger.log('Response Code: ' + responseCode);
  Logger.log('Response Body: ' + responseBody);

  // You might want to parse the responseBody if it's JSON
  try {
    var jsonResponse = JSON.parse(responseBody);
    Logger.log(jsonResponse);
  } catch (e) {
    Logger.log('Failed to parse response as JSON: ' + e.message);
  }
}
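On the response side, I’m assuming OpenRouter returns the usual OpenAI-style chat completion shape, so after parsing I plan to read the recognized text out of choices[0].message.content, roughly like this (a sketch, not verified against every error case):

  // Assumption: the response follows the OpenAI-compatible chat completion format,
  // i.e. the model's text is at choices[0].message.content.
  if (jsonResponse.choices && jsonResponse.choices.length > 0) {
    var ocrText = jsonResponse.choices[0].message.content;
    Logger.log('Recognized text: ' + ocrText);
  }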
Only about half of the cells are accurately detected with my current setup. I’m wondering whether preprocessing the images to improve handwriting contrast would help. I tried manually increasing the contrast with https://onlinepngtools.com/change-png-quality and saving the result, but afterwards I don’t see a significant improvement in character recognition. How can I improve the contrast further using an image processing library so that recognition improves?
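For reference, the kind of library-based preprocessing I have in mind is something like the sketch below, using the sharp package in Node (this would run as a separate step before uploading the image, since Apps Script has no built-in image manipulation). The file names and the threshold value are placeholders, and I haven’t verified whether hard binarisation helps the model or hurts it:

  // Preprocessing sketch (Node, outside Apps Script). Assumes "sharp" is installed;
  // "scan.png" / "scan-processed.png" and the threshold 180 are placeholders to tune.
  const sharp = require('sharp');

  sharp('scan.png')
    .grayscale()      // drop colour so only luminance matters
    .normalise()      // stretch the luminance range to full contrast
    .threshold(180)   // binarise: pixels below the value become black, the rest white
    .toFile('scan-processed.png')
    .then(() => console.log('Wrote scan-processed.png'))
    .catch((err) => console.error('Preprocessing failed: ' + err));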




