third_party/zxcvbn-cpp/data-scripts/count_xato.coffee - chromium/src - Git at Google

 matching = require '../lib/matching'
 scoring = require '../lib/scoring'

 fs = require 'fs'
 byline = require 'byline'
 sprintf = require('sprintf-js').sprintf


 check_usage = () ->
   usage = '''

   Run a frequency count on the raw 10M xato password set and keep counts over CUTOFF in
   descending frequency. That file can be found by googling around for:
   "xato 10-million-combos.txt"

   Passwords that both:
   -- fully match according to zxcvbn's date, year, repeat, sequence or keyboard matching algs
   -- have a higher rank than the corresponding match guess number

   are excluded from the final password set, since zxcvbn would score them lower through
   other means anyhow. in practice this rules out dates and years most often and makes room
   for more useful data.

   To use, first run from zxcvbn base dir:

   npm run build

   then change into data-scripts directory and run:

   coffee count_xato.coffee --nodejs xato_file.txt ../data/passwords.txt

   '''
   valid = process.argv.length == 5
   valid = valid and process.argv[0] == 'coffee' and process.argv[2] in ['--nodejs', '-n']
   valid = valid and __dirname.split('/').slice(-1)[0] == 'data-scripts'
   unless valid
     console.log usage
     process.exit(0)

 # after all passwords are counted, discard pws with counts <= COUNTS
 CUTOFF = 10

 # to save memory, after every batch of size BATCH_SIZE, go through counts and delete
 # long tail of entries with only one count.
 BATCH_SIZE = 1000000

 counts = {}       # maps pw -> count
 skipped_lines = 0 # skipped lines in xato file -- lines w/o two tokens
 line_count = 0    # current number of lines processed

 normalize = (token) ->
   token.toLowerCase()

 should_include = (password, xato_rank) ->
   for i in [0...password.length]
     if password.charCodeAt(i) > 127
       # xato mostly contains ascii-only passwords, so in practice
       # this will only skip one or two top passwords over the cutoff.
       # were that not the case / were this used on a different data source, consider using
       # a unidecode-like library instead, similar to count_wikipedia / count_wiktionary
       console.log "SKIPPING non-ascii password=#{password}, rank=#{xato_rank}"
       return false
   matches = []
   for matcher in [
     matching.spatial_match
     matching.repeat_match
     matching.sequence_match
     matching.regex_match
     matching.date_match
     ]
     matches.push.apply matches, matcher.call(matching, password)
   matches = matches.filter (match) ->
     # only keep matches that span full password
     match.i == 0 and match.j == password.length - 1
   for match in matches
     if scoring.estimate_guesses(match, password) < xato_rank
       # filter out this entry: non-dictionary matching will assign
       # a lower guess estimate.
       return false
   return true

 prune = (counts) ->
   for pw, count of counts
     if count == 1
       delete counts[pw]

 main = (xato_filename, output_filename) ->
   stream = byline.createStream fs.createReadStream(xato_filename, encoding: 'utf8')
   stream.on 'readable', ->
     while null != (line = stream.read())
       line_count += 1
       if line_count % BATCH_SIZE == 0
         console.log 'counting tokens:', line_count
         prune counts
       tokens = line.trim().split /\s+/
       unless tokens.length == 2
         skipped_lines += 1
         continue
       [username, password] = tokens[..1]
       password = normalize password
       if password of counts
         counts[password] += 1
       else
         counts[password] = 1
   stream.on 'end', ->
     console.log 'skipped lines:', skipped_lines
     pairs = []
     console.log 'copying to tuples'
     for pw, count of counts
       if count > CUTOFF
         pairs.push [pw, count]
       delete counts[pw] # save memory to avoid v8 1GB limit
     console.log 'sorting'
     pairs.sort (p1, p2) ->
       # sort by count. higher counts go first.
       p2[1] - p1[1]
     console.log 'filtering'
     pairs = pairs.filter (pair, i) ->
       rank = i + 1
       [pw, count] = pair
       should_include pw, rank
     output_stream = fs.createWriteStream output_filename, encoding: 'utf8'
     for pair in pairs
       [pw, count] = pair
       output_stream.write sprintf("%-15s %d\n", pw, count)
     output_stream.end()

 check_usage()
 main process.argv[3], process.argv[4]
	matching = require '../lib/matching'
	scoring = require '../lib/scoring'

	fs = require 'fs'
	byline = require 'byline'
	sprintf = require('sprintf-js').sprintf


	check_usage = () ->
	usage = '''

	Run a frequency count on the raw 10M xato password set and keep counts over CUTOFF in
	descending frequency. That file can be found by googling around for:
	"xato 10-million-combos.txt"

	Passwords that both:
	-- fully match according to zxcvbn's date, year, repeat, sequence or keyboard matching algs
	-- have a higher rank than the corresponding match guess number

	are excluded from the final password set, since zxcvbn would score them lower through
	other means anyhow. in practice this rules out dates and years most often and makes room
	for more useful data.

	To use, first run from zxcvbn base dir:

	npm run build

	then change into data-scripts directory and run:

	coffee count_xato.coffee --nodejs xato_file.txt ../data/passwords.txt

	'''
	valid = process.argv.length == 5
	valid = valid and process.argv[0] == 'coffee' and process.argv[2] in ['--nodejs', '-n']
	valid = valid and __dirname.split('/').slice(-1)[0] == 'data-scripts'
	unless valid
	console.log usage
	process.exit(0)

	# after all passwords are counted, discard pws with counts <= COUNTS
	CUTOFF = 10

	# to save memory, after every batch of size BATCH_SIZE, go through counts and delete
	# long tail of entries with only one count.
	BATCH_SIZE = 1000000

	counts = {} # maps pw -> count
	skipped_lines = 0 # skipped lines in xato file -- lines w/o two tokens
	line_count = 0 # current number of lines processed

	normalize = (token) ->
	token.toLowerCase()

	should_include = (password, xato_rank) ->
	for i in [0...password.length]
	if password.charCodeAt(i) > 127
	# xato mostly contains ascii-only passwords, so in practice
	# this will only skip one or two top passwords over the cutoff.
	# were that not the case / were this used on a different data source, consider using
	# a unidecode-like library instead, similar to count_wikipedia / count_wiktionary
	console.log "SKIPPING non-ascii password=#{password}, rank=#{xato_rank}"
	return false
	matches = []
	for matcher in [
	matching.spatial_match
	matching.repeat_match
	matching.sequence_match
	matching.regex_match
	matching.date_match
	]
	matches.push.apply matches, matcher.call(matching, password)
	matches = matches.filter (match) ->
	# only keep matches that span full password
	match.i == 0 and match.j == password.length - 1
	for match in matches
	if scoring.estimate_guesses(match, password) < xato_rank
	# filter out this entry: non-dictionary matching will assign
	# a lower guess estimate.
	return false
	return true

	prune = (counts) ->
	for pw, count of counts
	if count == 1
	delete counts[pw]

	main = (xato_filename, output_filename) ->
	stream = byline.createStream fs.createReadStream(xato_filename, encoding: 'utf8')
	stream.on 'readable', ->
	while null != (line = stream.read())
	line_count += 1
	if line_count % BATCH_SIZE == 0
	console.log 'counting tokens:', line_count
	prune counts
	tokens = line.trim().split /\s+/
	unless tokens.length == 2
	skipped_lines += 1
	continue
	[username, password] = tokens[..1]
	password = normalize password
	if password of counts
	counts[password] += 1
	else
	counts[password] = 1
	stream.on 'end', ->
	console.log 'skipped lines:', skipped_lines
	pairs = []
	console.log 'copying to tuples'
	for pw, count of counts
	if count > CUTOFF
	pairs.push [pw, count]
	delete counts[pw] # save memory to avoid v8 1GB limit
	console.log 'sorting'
	pairs.sort (p1, p2) ->
	# sort by count. higher counts go first.
	p2[1] - p1[1]
	console.log 'filtering'
	pairs = pairs.filter (pair, i) ->
	rank = i + 1
	[pw, count] = pair
	should_include pw, rank
	output_stream = fs.createWriteStream output_filename, encoding: 'utf8'
	for pair in pairs
	[pw, count] = pair
	output_stream.write sprintf("%-15s %d\n", pw, count)
	output_stream.end()

	check_usage()
	main process.argv[3], process.argv[4]