Increase the number of messages read in a timeframe using google API - gmail-api

My application needs to read the message id and subject of all the messages in Gmail using the Google REST API. I understand that I need to make two kinds of calls: the first retrieves all the message ids, and then I retrieve the subjects for those ids in batch.
Currently I am able to read 3000 messages in 90 seconds. I want to be able to read more messages in the same timeframe.
Please suggest how this can be done.
Note: I am already using gzip and partial response. Any other suggestions would be helpful.

The Gmail API has a couple of limitations when it comes to usage:
Daily Usage: 1,000,000,000 quota units per day
Per User Rate Limit: 250 quota units per user per second, moving average (allows short bursts)
Listing messages costs 5 quota units, and getting an individual message also costs 5 quota units.
Listing allows us to get 100 message ids in one API call. This gives us 250 / 5 = 50 listing calls per second => 50 * 100 = 5000 message ids per second. Just listing ~3000 messages in quick succession should not make you hit a brick wall, as far as the quota goes.
Ignoring the bursting capabilities of the quota, getting messages can be done 250 / 5 = 50 messages a second. This would in theory allow us to get 3000 messages in 3000 / 50 = 60 seconds, landing at a little bit more than a minute total.
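As a quick sanity check, here is the same arithmetic in code (the constants are just the quota figures quoted above):
var QUOTA_PER_USER_PER_SECOND = 250; // moving average, short bursts allowed
var COST_PER_CALL = 5;               // messages.list and messages.get both cost 5 units
var IDS_PER_LIST_PAGE = 100;         // message ids returned per listing call

var callsPerSecond = QUOTA_PER_USER_PER_SECOND / COST_PER_CALL; // 50
console.log(callsPerSecond * IDS_PER_LIST_PAGE); // => 5000 ids listed per second
console.log(3000 / callsPerSecond);              // => 60 seconds to get 3000 messages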
I have never tried something this quota-intensive before, so I wrote a little experiment for fun using Node.js, to test how hard you can burst the quota. I made sure to only ask for partial data, and I used batch requests:
var rp = require('request-promise');
var googleUtils = require('google-api-batch-utils');
var createBatchBody = googleUtils.createBatchBody;
var parseBatchResponse = googleUtils.parseBatchResponse;
var _ = require('lodash');

// Boundary used in the batch request, to reduce the number of http requests
// when getting the subject of the messages.
var BOUNDARY = 'example_boundary';

// If the quota should be exceeded, how long should we wait to try again?
var TIMEOUT = 1000;

// Playground access token authorized with the Gmail scopes:
// https://developers.google.com/oauthplayground/
var ACCESS_TOKEN = '{API_KEY}';

// Page through users.messages.list until every message id has been collected.
function listAllMessageIds() {
  var resultingIds = [];
  return (function listMessageIds(pageToken) {
    return rp({
      uri: 'https://www.googleapis.com/gmail/v1/users/me/messages',
      qs: {
        access_token: ACCESS_TOKEN,
        pageToken: pageToken,
        fields: 'messages(id),nextPageToken'
      },
      json: true
    }).then(function(response) {
      var messages = response.messages;
      var nextPageToken = response.nextPageToken;
      if (messages) {
        // _.pluck was renamed to _.map in lodash 4+
        resultingIds = resultingIds.concat(_.pluck(messages, 'id'));
      }
      if (nextPageToken) {
        return listMessageIds(nextPageToken);
      } else {
        return resultingIds;
      }
    });
  })(null);
}

// Fetch the subjects in batches of 100 metadata-only requests per HTTP call.
function getSubjectOfAllMessages(messageIds) {
  var resultingSubjectIdObjects = [];
  var uris = messageIds.map(function(id) {
    return {
      uri: '/gmail/v1/users/me/messages/' + id,
      qs: {
        fields: 'id,payload/headers',
        format: 'metadata',
        metadataHeaders: 'subject'
      }
    };
  });
  var idChunks = _.chunk(uris, 100);
  return (function getSubjectOfChunk(chunk) {
    if (!chunk) {
      return resultingSubjectIdObjects;
    }
    var batchBody = createBatchBody(chunk, BOUNDARY);
    return rp({
      method: 'POST',
      uri: 'https://www.googleapis.com/batch',
      headers: {
        Authorization: 'Bearer ' + ACCESS_TOKEN,
        'Content-Type': 'multipart/mixed; boundary="' + BOUNDARY + '"'
      },
      body: batchBody
    })
    .then(parseBatchResponse)
    .then(function(messages) {
      resultingSubjectIdObjects =
        resultingSubjectIdObjects.concat(messages.map(function (m) {
          return {id: m.id, subject: _.get(m, 'payload.headers[0].value') || ''};
        }));
      return getSubjectOfChunk(idChunks.shift());
    })
    .catch(function(error) {
      // Quota exceeded: back off and retry the same chunk after TIMEOUT ms
      return new Promise(function(resolve, reject) {
        setTimeout(function() {
          resolve(getSubjectOfChunk(chunk));
        }, TIMEOUT);
      });
    });
  })(idChunks.shift());
}

console.time(1);
listAllMessageIds().then(getSubjectOfAllMessages).then(function(result) {
  console.log(result.length + ' messages were fetched in ');
  console.timeEnd(1);
}).catch(console.error.bind(console));
// => 7534 messages were fetched in 63277ms
With partial responses and batch requests, I could fetch ~7500 messages without trouble in 63 seconds.

Besides batching, there is also the option of parallelizing across multiple threads or processes. Also, if you only need subject headers, make sure you're using messages.get(format=METADATA, metadataHeaders=["subject"]) so you're only requesting the data you need.
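For reference, a single metadata-only fetch with request-promise would look like this (messageId and ACCESS_TOKEN are placeholders, following the experiment above):
var rp = require('request-promise');

// Ask only for the Subject header of one message; format=metadata plus
// metadataHeaders keeps the response payload minimal.
rp({
  uri: 'https://www.googleapis.com/gmail/v1/users/me/messages/' + messageId,
  qs: {
    access_token: ACCESS_TOKEN,
    format: 'metadata',
    metadataHeaders: 'Subject',
    fields: 'id,payload/headers'
  },
  json: true
}).then(function(message) {
  console.log(message.payload.headers[0].value); // the subject line
});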

Related

Why am I only receiving 50 messages with MessageManager.fetch()?

I'm in the process of debugging the /purge command for my Discord bot.
My intention is to fetch the entirety of a text channel, and delete any amount of messages, by calling the TextChannel.bulkDelete method multiple times, since that method has a limit of deleting 100 messages at a time. This is my code:
async purgeDelete(
  channel: TextChannel,
  amount: number | undefined,
  target: GuildMember | undefined,
  keyword: string | undefined
): Promise<number> {
  // Most confused about this line: Am I doing the right thing?
  const messages = await channel.messages.fetch();
  const twoWeeksAgo = new Date();
  twoWeeksAgo.setDate(twoWeeksAgo.getDate() - 14);
  const purgelist = messages.filter(message => (
    (!target || message.author.id === target.id)
    && (!keyword || message.content.includes(keyword))
    && this.resultMessage?.id !== message.id
    && message.createdAt > twoWeeksAgo
  ));
  let purgeAmount: number;
  if (amount === undefined) {
    purgeAmount = purgelist.size;
  } else {
    console.log(purgelist.size, messages.size);
    purgeAmount = Math.min(amount, purgelist.size);
  }
  const slicedPurgelist = purgelist.first(purgeAmount);
  const partitionedPurgelist = [];
  for (let i = 0; i < slicedPurgelist.length; i += 100) {
    partitionedPurgelist.push(slicedPurgelist.slice(i, i + 100));
  }
  await Promise.all(partitionedPurgelist.map(messages => channel.bulkDelete(messages)));
  return purgeAmount;
}
I'm pretty sure the only line that matters is the fetch() call. When called in my program, the API is giving me 50 messages. Is that intentional? I know there is an option for limit, but that only goes up to 100. If there are any workarounds to this, please let me know!
The Discord API has a hard limit of 100 messages per GET request (and fetch() without a limit defaults to 50, which is why you're seeing 50). Unfortunately, you can't bypass this limit, and it is intentional on Discord's part.
Furthermore, fetching the entirety of a text channel is probably a bad idea, especially with larger servers which could have 100k+ messages per channel.
A "sort-of" workaround is to use the before param in FetchMessageOptions plus a loop to continue fetching messages. See below for an example:
const messages = [];
const messagesToFetch = 1000;
let before; // id of the oldest message fetched so far

while (messages.length < messagesToFetch) {
  // Fetch up to 100 messages, starting before the oldest one seen so far
  const batch = await channel.messages.fetch(before ? { limit: 100, before } : { limit: 100 });
  if (batch.size === 0) break; // the channel has no older messages left
  messages.push(...batch.values()); // flatten the Collection into the array
  before = batch.last().id; // fetch() returns newest-first, so last() is the oldest
}

How To Circumvent 504 Errors

I am working in ReactJs and one of the main aspects of our project is the ability to upload a scorecard and have all of its results parsed and placed into objects. However, due to the nature of these pdfs that get uploaded, there's a LOT of information, an average of 12-14 pages.
Most of the information is irrelevant, I usually will only need pages 5-7, but users will be users, and they upload all 12.
I am using the pdfParser API, which is very good; we're not looking for replacements on that. However, due to how large the file is, if I am somewhere with only half-decent connection, I am hit with a 504 error since the process takes so long. If I have good to great connection, there's no issue.
This being said, I have two questions:
Is there a way to extend the amount of time that needs to elapse before my computer gives up on the process?
Is there a way to parse only SOME of the pages that get submitted?
The relevant code is shown below...
var url = 'https://pdftables.com/api?key=770oukvvx1wl&format=xlsx-single';

const pdfToExcel = (pdfFile) => {
  var req = request.post({encoding: null, url: url}, async function (err, resp, body) {
    if (!err && resp.statusCode == 200) {
      fs.writeFile(`${pdfFile.path}.xlsx`, body, function(err) {
        if (err) {
          console.log('error writing file');
        }
      });
    } else {
      console.log('error retrieving URL');
    }
  });
  var form = req.form();
  form.append('file', fs.createReadStream(`./${pdfFile.path}`));
}
const parseExcel = async (file) => {
  let workSheetsFromFile;
  if (file.path.search(".xlsx") === -1) {
    // The upload was a pdf: parse the converted .xlsx, then clean up both files.
    // path.resolve and fs.unlinkSync are synchronous, so await is unnecessary on them.
    const filePath = path.resolve(`./${file.path}.xlsx`);
    workSheetsFromFile = await xlsx.parse(`./${file.path}.xlsx`);
    fs.unlinkSync(`./${file.path}`);
    fs.unlinkSync(filePath);
    return workSheetsFromFile[0].data;
  }
  if (file.path.search(".xlsx") !== -1) {
    // The upload was already an .xlsx: parse it directly.
    const filePath = path.resolve(`./${file.path}`);
    workSheetsFromFile = await xlsx.parse(`./${file.path}`);
    fs.unlinkSync(filePath);
    return workSheetsFromFile[0].data;
  }
}
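On the second question, one way to parse only some of the pages is to trim the PDF before uploading, so less data crosses the slow connection in the first place. A minimal sketch using the pdf-lib package (an assumption, not part of the original code):
const fs = require('fs');
const { PDFDocument } = require('pdf-lib'); // assumed dependency

// Copy only the needed pages (e.g. pages 5-7, zero-based indices 4-6)
// into a new, smaller PDF before sending it to the parsing API.
const extractPages = async (inputPath, outputPath, pageIndices) => {
  const srcDoc = await PDFDocument.load(fs.readFileSync(inputPath));
  const outDoc = await PDFDocument.create();
  const pages = await outDoc.copyPages(srcDoc, pageIndices);
  pages.forEach((page) => outDoc.addPage(page));
  fs.writeFileSync(outputPath, await outDoc.save());
};

// Usage: await extractPages(`./${pdfFile.path}`, `./${pdfFile.path}.trimmed.pdf`, [4, 5, 6]);
// then upload the trimmed file in pdfToExcel instead of the original.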

PATCH Request fails for one endpoint, but does work for other endpoints

I'm a bit new to testing and am working on editing tests for a MEAN-stack web app generated by the yeoman angular-fullstack generator. I've POSTed a dummy object into my mongo database and can see the object and its ID through the mongo shell.
I can perform GET and PUT requests on the object; however, trying to perform a PATCH request on the ID returns OPERATION_PATH_UNRESOLVABLE.
Does anyone have any suggestions as to why this may be happening?
I've included a sample of the code below, this code works perfectly for my other endpoints. I'm only receiving the error with one specific endpoint.
describe('PATCH /api/objects/:id', function() {
  var patchedObject;

  beforeEach(function(done) {
    newObject.title = 'Patched Object';
    newObject.section.Title = 'Patched Object Sec Title';
    newObject.section.Body = 'Patched Object Sec Body';
    newObject.section.Lists = ['Patched Sec List Item 0'];
    newObject.images = ['N/A'];
    newObject.date.startDate = '1/5/19';
    newObject.date.endDate = '1/10/19';
    newObject.duration = '5 Days';
    newObject.location = 'VA';
    newObject.isProgram = true;
    newObject.hasRegistration = true;
    newObject.linksOut.title = 'Patched Link';
    newObject.linksOut.address = 'Patched Address';
    newObject.backGround = 'black';
    newObject.orderIndex = objects.length;
    request(app)
      .patch(`/api/promotions/${newObject._id}`)
      .set('authorization', 'Bearer ' + token)
      .send(newObject)
      .expect(200)
      .expect('Content-Type', /json/)
      .end(function(err, res) {
        if (err) {
          return done(err);
        }
        patchedObject = res.body;
        done();
      });
  });

  afterEach(function() {
    patchedObject = {};
  });

  it('should respond with the patched promotion when authenticated',
    function() {
      expect(patchedObject.title).to.equal('Patched Object');
    });
});
I expect the output to be 200 but receive 500 Internal Server Error.
My Logger returns OPERATION_PATH_UNRESOLVABLE
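One likely cause, offered as an assumption rather than a confirmed diagnosis: OPERATION_PATH_UNRESOLVABLE is an error code raised by the fast-json-patch library, which the angular-fullstack generator uses in its PATCH controllers. Those endpoints expect an RFC 6902 array of patch operations, not a full replacement object like the one .send(newObject) posts. If that is the case, the request would look something like this:
request(app)
  .patch(`/api/promotions/${newObject._id}`)
  .set('authorization', 'Bearer ' + token)
  // RFC 6902: an array of operations instead of the whole document
  .send([
    { op: 'replace', path: '/title', value: 'Patched Object' },
    { op: 'replace', path: '/location', value: 'VA' }
  ])
  .expect(200)
  .expect('Content-Type', /json/);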

Construct array in nodejs through for loop

I am using sails (0.11.0) running on nodejs (6.9.1). I am trying to construct an array by filling it through for loop. I would send this completed array in response to the client. I have tried various methods as suggested by people here on Stack Overflow, for example
the discussion here suggested
for (var i = yearStart; i < yearEnd + 1; i++) {
  arr.push(i);
}
In this discussion, it is suggested to use:
var array = calendars.map(function(item) {
  return item.id;
});
console.log(array);
Similarly I tried many methods, but I keep hitting the same issue: the array gets filled during the loop, but as soon as the loop completes the array is empty again because of the asynchronous processing, so I cannot send the response. To work around this, I tried checking the index inside the loop body and sending the response from inside the loop itself:
var userArray = [];
_.each(users, function(user, index){
  MySQLConnector.query('CALL user_image (?)', [user.id], function(err, userImage){
    if (err) {
      return res.json({"status": "some_error"});
    } else {
      userID = user.id;
      userImageID = userImage[0][0].id;
      var userInfo = {
        userID: userID,
        userImageID: userImageID
      };
      userArray.push(userInfo);
      if (index == users.length - 1) {
        res.json({selectedUsers: userArray});
      }
    }
  });
});
I am initiating an empty userArray and then iterating through the users object, where each element is characterized by the name user and an index. Through a MySQL query I fetch the userImage object, and in each iteration I create an object called userInfo consisting of userID and userImageID and push it into userArray. After each iteration of the loop (_.each), I check whether the last index has been reached; once it has, the final array is sent as the response before the loop body completes.
Here too I have an issue: the array is not always completely filled. Because the queries complete asynchronously, the callbacks do not fire in index order 0, 1, 2, 3, 4, ...; the first callback to finish might be for index 4, the next for 0, then 2, and so on, in a different order on every run. Therefore, if users.length is 8 and the callback for index 7 happens to finish third, the condition index == users.length - 1 is met and the response is sent with an array of only 3 elements rather than 8.
Can someone suggest me a better and robust way to fill an array through the for loop in nodejs and send that array in response, so that all items are included in the array in their original order?
As you are using Node.js, it is better to use a promise library like bluebird, or async, to handle asynchronous requests.
The reason your loop is not working as expected is, as you've pointed out, that the async requests take time to resolve, and the _.each loop does not wait for them.
Using bluebird, it can be done with the Promise.map method, which is explained as follows in the documentation:
Given an Iterable(arrays are Iterable), or a promise of an Iterable,
which produces promises (or a mix of promises and values), iterate
over all the values in the Iterable into an array and map the array to
another using the given mapper function.
Promises returned by the mapper function are awaited for and the
returned promise doesn't fulfill until all mapped promises have
fulfilled as well. If any promise in the array is rejected, or any
promise returned by the mapper function is rejected, the returned
promise is rejected as well.
Hence, using Promise.map, your code can be updated as below. Note that the callback-style query has to be wrapped in a promise so that Promise.map has something to wait on:
var Promise = require("bluebird");

return Promise.map(users, function(user) {
  // Wrap the callback-based query in a promise so Promise.map can await it
  return new Promise(function(resolve, reject) {
    MySQLConnector.query('CALL user_image (?)', [user.id], function(err, userImage) {
      if (err) {
        return reject({"status": "some_error"});
      }
      resolve({
        userID: user.id,
        userImageID: userImage[0][0].id
      });
    });
  });
})
.then(function(usersArray) {
  // Promise.map resolves results in the same order as the input array
  res.json({selectedUsers: usersArray});
})
.catch(function(err) {
  res.json(err);
});
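As a follow-up, if MySQLConnector.query follows the standard Node callback convention, bluebird can also wrap it once up front instead of wrapping every call by hand (a sketch, assuming the connector is promisify-compatible):
var queryAsync = Promise.promisify(MySQLConnector.query, { context: MySQLConnector });

return Promise.map(users, function(user) {
  return queryAsync('CALL user_image (?)', [user.id]).then(function(userImage) {
    return { userID: user.id, userImageID: userImage[0][0].id };
  });
}).then(function(usersArray) {
  res.json({ selectedUsers: usersArray });
});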
Alternatively, you can execute loops over callback-based functions in a synchronous style using SynJS:
var SynJS = require('synjs');
var mysql = require('mysql');
var connection = mysql.createConnection({
  host     : 'localhost',
  user     : 'tracker',
  password : 'tracker123',
  database : 'tracker'
});

function myFunction1(modules, connection, users) {
  var ret = [];
  for (var i = 0; i < users.length; i++) {
    connection.query("SELECT CONCAT('some image of user #',?) AS userImage", [users[i]], function(err, rows, fields) {
      if (err) throw err;
      ret.push({
        id: users[i],
        image: rows[0].userImage
      });
      modules.SynJS.resume(_synjsContext); // <-- indicate that callback is finished
    });
    SynJS.wait(); // <-- wait for callback to finish
  }
  return ret;
};

var modules = {
  SynJS: SynJS,
  mysql: mysql,
};

var users = [1, 5, 7, 9, 20, 21];
SynJS.run(myFunction1, null, modules, connection, users, function (ret) {
  console.log('done. result is:');
  console.log(ret);
});
The result would be the following:
done. result is:
[ { id: 1, image: 'some image of user #1' },
  { id: 5, image: 'some image of user #5' },
  { id: 7, image: 'some image of user #7' },
  { id: 9, image: 'some image of user #9' },
  { id: 20, image: 'some image of user #20' },
  { id: 21, image: 'some image of user #21' } ]

How to use Bluebird promisification with generators + parallel promises

I'm trying to fire off multiple requests to the Beats API, using bluebird as well as koa for generators.
After reading some documentation, I figured the following would work:
var request = require('co-request'),
    _ = require('lodash'),
    Promise = require('bluebird');

request = Promise.promisifyAll(request);

module.exports.getTracks = function *tracks() {
  var promises = []; // holds the remaining paged API calls
  var firstCall = yield makeAPICall('users/' + me + '/mymusic/tracks?limit=150');
  var total = firstCall.body.info.total;
  total -= 150;
  var tracks = firstCall.body.data;
  // Beats only allows a maximum of 150 tracks per call.
  // If more tracks are needed then the remainder is called in sets of 150.
  var offset = 150;
  while (total > 0) {
    promises.push(makeAPICall('users/' + me + '/mymusic/tracks?limit=150&offset=' + offset));
    offset += 150;
    total -= 150;
  }
  var responses = yield (Promise.all(promises));
};

function makeAPICall(query) {
  var authOptions = {
    url: 'https://partner.api.beatsmusic.com/v1/api/' + query,
    headers: { 'Authorization': 'Bearer ' + accessToken },
    json: true
  };
  return request.get(authOptions);
}
The method makeAPICall works as expected when used for firstCall, but for some reason, when I place the makeAPICall calls into the array, they never seem to execute. The variable responses yields just an array of functions instead of an array of responses from the Beats API. What do I need to change to make responses return an array of objects similar to firstCall?
You're using co-request, which already converts callbacks to thunks, so there is no need to promisify anything.
Here is a simplified runnable example, similar to your code, showing how to run api calls in parallel with Koa (which uses co under the hood).
When you yield an array, co will run any thunks/promises/generators etc in parallel.
var request = require('co-request'),
    co = require('co');

co(function *() {
  var results = yield getTracks();
  results.forEach(function(result) {
    console.log(result.body);
  });
}).then();

function * getTracks () {
  var queries = [];
  // swap out your queries here
  queries.push(makeAPICall('5185415ba171ea3a00704eed'));
  queries.push(makeAPICall('54fdc3c9862a3aab01dc95cf'));
  queries.push(makeAPICall('54fdc3da862a3aa501dc95d0'));
  // yielding an array returns an array of results
  var results = yield queries;
  return results;
}

function makeAPICall (query) {
  var options = {
    url: 'http://www.mocky.io/v2/' + query,
    json: true
  };
  return request.get(options);
}
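A side note on the same mechanism: co can also run an object of thunks/promises in parallel, which gives you named results instead of positional ones (worth verifying against the co version your koa pulls in):
function * getNamedTracks () {
  // Yielding an object resolves all values in parallel, keyed by name
  var results = yield {
    first: makeAPICall('5185415ba171ea3a00704eed'),
    second: makeAPICall('54fdc3c9862a3aab01dc95cf')
  };
  return results; // { first: ..., second: ... }
}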
