I have written a chunk of Node that is acting as a web hook for a GitHub repository I have. Basically when a commit is made to that repo, a request gets made to my function which then looks for potentially sensitive data that has been checked in. I'm making multiple calls to the GitHub API to pull down data. I tried to handle all of these calls by using the async package's waterfall
and each
functions.
There are two sections to the waterfall call: getting the commit data and then getting the file data. Within the "file data" section I have the async each
that is getting the actual file contents for each file.
I'm wondering if anyone can see any obvious best practices that I am not following for JavaScript programming. Am I handling the asynchronous calls the correct way? I created two helper functions to try and keep things clean/readable - should I break any other pieces out into their own functions?
var https = require('https')
var async = require('async')
module.exports = function (ctx,cb) {
console.log("**********************************************")
var danger = ['password','pass','pw','pwd','passwd','key','user','username','secret']
var re = new RegExp(danger.join('|'))
async.waterfall([
function(callback){
var options = {
host: 'api.github.com',
path: getApiPath(ctx.data.head_commit.url),
headers: {
'User-Agent': 'Broham'
}
}
getData(options, function(commitData){
// console.log("commit data", commitData)
callback(null,commitData.files)
})
},
//for each file, get the file content and look for potentially sensitive information that has been commited
function(files, callback){
// console.log("files", files)
var review = []
async.each(files, function(file, fileCallback){
var fileOptions = {
host: 'api.github.com',
path: file.contents_url.replace('https://api.github.com', ''),
headers: {
'User-Agent': 'Broham'
}
}
// get the JSON that will have the URL for the raw file content
getData(fileOptions, function(fileData){
// get the raw file content
https.get(fileData.download_url, function(contentData){
contentData.setEncoding('utf-8')
var contents = ''
contentData.on('data', function (data) {
contents += data
})
contentData.on('end', function(){
console.log('contents', contents)
// check to see if there are any matches for the terms we want to review.
matches = contents.match(re)
if(matches.length > 0){
// console.log("matches", matches)
review = review.concat(contents.match(re))
}
fileCallback()
})
})
})
}, function(err){
callback(null, review)
})
}
], function(err, results){
if(results.length > 0){
console.log("For review:", results)
}
cb(null, 'GitEye complete!')
console.log("**********************************************")
})
}
function getData(options, callback){
https.get(options, function(res){
res.setEncoding('utf-8')
var body = ''
res.on('data', function (data) {
body += data
})
res.on('end', function(){
// console.log('body', body)
var data = JSON.parse(body);
callback(data)
})
})
}
function getApiPath(url){
var pieces = url.split('/')
var user = pieces[3]
var repo = pieces[4]
var commit = pieces[6]
// return "https://api.github.com/repos/" + user + "/" + repo + "/commits/" + commit
return "/repos/" + user + "/" + repo + "/commits/" + commit
}
1 Answer 1
Looking for sensitive terms
I see that you build a regex from a list of terms:
var danger = ['password','pass','pw','pwd','passwd','key','user','username','secret'] var re = new RegExp(danger.join('|'))
With this approach,
some of the items in danger
are redundant,
because "password" and "passwd" will be already matched by "pass",
"pwd" will be already matched by "pw",
and "username" will be already matched by "user".
Overall this approach looks rather primitive and I suspect it will yield too many false positives to be useful.
Duplicated calls
In this code contents.match(re)
is executed twice:
matches = contents.match(re) if(matches.length > 0){ // console.log("matches", matches) review = review.concat(contents.match(re)) }
That's a waste, it would be better to use matches
in place of the second call.
Truthy values in JavaScript
Non-empty arrays are truthy, so instead of if(matches.length > 0)
you could write simply if (matches.length)
, and even just if (matches)
.
The same goes for if(results.length > 0)
later in the code.