Rejecting hash contents if they are not in array - arrays

I have this array:
array = ["1", "2", "3", "4"]
I have this array of hashes:
ah = [
{:id=>"1", :value=>"A"},
{:id=>"2", :value=>"B"},
{:id=>"3", :value=>"C"},
{:id=>"4", :value=>"D"},
{:id=>"5", :value=>"E"},
{:id=>"6", :value=>"F"},
{:id=>"7", :value=>"G"},
{:id=>"8", :value=>"H"},
]
I need to reject any hash in ah whose id is not in array.
What is the best way of achieving this?

You can select the inverse - the hashes whose id is in array by using this code:
ah.select{|el| array.include?(el[:id])}
If you prefer reject, you can use:
ah.reject{|el| !array.include?(el[:id])}
For more info: Array#reject, Array#select.
These methods create a new array, if you want to modify in place use Array#reject! or Array#select!.

For big pieces of data I would go with some preprocessing to avoid O(n*m) lookups.
array = ["1", "2", "3", "4"]
array_hash = array.each_with_object({}){ |i, h| h[i] = true }
ah.select{ |obj| array_hash[obj[:id]] }

I realize there is already an accepted answer but since all the answers here are in O(n*m), I thought I'd propose an alternative in O(n)*.
Here's a rough benchmark if the ah array has 100_000 items and we have 10_000 items in the sub array. I'm including fl00r's answer here and Cary's as we're all trying to avoid the O(n*m) scenario.
user system total real
select with include 34.610000 0.110000 34.720000 ( 34.924679)
reject with include 34.320000 0.100000 34.420000 ( 34.611992)
group and select 0.170000 0.010000 0.180000 ( 0.182358)
select by value 0.040000 0.000000 0.040000 ( 0.041073)
select with set 0.040000 0.000000 0.040000 ( 0.048331)
hashify then values 0.130000 0.010000 0.140000 ( 0.139686)
The code to reproduce this:
require 'benchmark'
require 'set'
list_size = 100_000
sub_list_size = 10_000
# Fixture: one hash per id, with ids running over 0...list_size.
ah = Array.new(list_size) { |i| { id: i, value: "A" } }
# Random ids to look up (duplicates are possible). Drawing with rand avoids
# materialising a (list_size + 1)-element array for every single sample, and
# keeps every drawn id inside the range of ids that actually exist in ah.
array = Array.new(sub_list_size) { rand(list_size) }
# Selects the hashes of +ah+ whose :id appears in +array+, by grouping on :id
# first and then picking the wanted groups.
#
# @param ah [Array<Hash>] hashes carrying an :id key
# @param array [Array] ids to keep
# @return [Array<Hash>] the matching hashes, group by group, in order of each
#   id's first appearance in +ah+
def group_than_select(ah, array)
  grouped = ah.group_by { |x| x[:id] }
  # Intersection, not difference: `grouped.keys - array` would keep exactly
  # the hashes whose id is NOT wanted.
  good_keys = grouped.keys & array
  good_keys.flat_map { |i| grouped[i] }
end
# Keeps the hashes of +ah+ whose :id is one of the values in +array+.
# The ids are loaded into a hash first so each membership test is a hash
# lookup rather than a scan of +array+.
def select_by_fl00r(ah, array)
  wanted = {}
  array.each { |id| wanted[id] = true }
  ah.select { |entry| wanted[entry[:id]] }
end
# Keeps the hashes of +ah+ whose :id is contained in +array+, testing
# membership against a Set built once from +array+.
def select_with_set(ah, array)
  ids = Set.new(array)
  ah.select { |entry| ids.member?(entry[:id]) }
end
# Indexes +ah+ by :id (a later hash with the same :id replaces an earlier
# one), then returns the indexed hashes whose id also occurs in +array+,
# in the order the ids first appear in +ah+.
def hashify_then_values_at(ah, array)
  by_id = {}
  ah.each { |row| by_id[row[:id]] = row }
  by_id.values_at(*(by_id.keys & array))
end
# Times every candidate implementation against the same +ah+ / +array+
# fixtures. The argument 25 is the width reserved for the report labels.
Benchmark.bm(25) do |x|
# Baseline: Array#include? scans +array+ once per element of +ah+.
x.report("select with include") do
ah.select{|el| array.include?(el[:id])}
end
# Same scan as above, expressed through reject with a negated test.
x.report("reject with include") do
ah.reject{|e| !array.include?(e[:id])}
end
# group_by on :id, then pick groups by key.
x.report("group and select") do
group_than_select(ah, array)
end
# Hash used as a membership index.
x.report("select by value") do
select_by_fl00r(ah, array)
end
# Set used as a membership index.
x.report("select with set") do
select_with_set(ah, array)
end
# Index +ah+ by :id, then fetch the wanted entries with values_at.
x.report("hashify then values") do
hashify_then_values_at(ah, array)
end
end
Hash maps are typically O(1) search though O(n) worst case is possible.

A better solution than rejecting the ids that are not in the array is to select only the ones that are:
ah.select { |hash| array.include?(hash[:id]) }

Here are two more possibilities.
array = ["1", "2", "3", "4", "99999999"]
#1
I expect the include? solutions would be considerably faster if array were first converted to a set:
require 'set'
# Returns the hashes of +ah+ whose :id is in +array+. +array+ is converted
# to a Set once, so each membership test is a set lookup.
def select_with_set(ah, array)
  ids = Set.new(array)
  ah.select { |entry| ids.member?(entry[:id]) }
end
select_with_set(ah, array)
#=> [{:id=>"1", :value=>"A"}, {:id=>"2", :value=>"B"},
# {:id=>"3", :value=>"C"}, {:id=>"4", :value=>"D"}]
#2
If, as in the example, the hash elements of ah have distinct values for :id, one could do this:
# Builds an :id => hash index from +ah+ (assumes ids are distinct; a repeated
# id keeps only its last hash) and returns the entries whose id is in +array+.
def hashify_then_values_at(ah, array)
  by_id = {}
  ah.each { |row| by_id[row[:id]] = row }
  by_id.values_at(*(by_id.keys & array))
end
hashify_then_values_at(ah, array)
#=> [{:id=>"1", :value=>"A"}, {:id=>"2", :value=>"B"},
# {:id=>"3", :value=>"C"}, {:id=>"4", :value=>"D"}]

Related

Get the average of numbers in an array which is the values of an hash

in a Ruby program I have an hash which has normal strings as keys and the values are array of numbers:
hash_1 = {"Luke"=> [2,3,4], "Mark"=>[3,5], "Jack"=>[2]}
And what I'm looking for is to have as result the same hash with the values that become the average of the numbers inside the arrays:
{"Luke"=> 3, "Mark"=>4, "Jack"=>2}
One way to make it to work can be to create a new empty hash_2, loop over hash_1 and within the block assign the keys to hash_2 and the average of the numbers as values.
hash_2 = {}
hash_1.each do |key, value|
hash_2[key] = value.sum / value.count
end
hash_2 = {"Luke"=> 3, "Mark"=>4, "Jack"=>2}
Is there a better way I could do this, for instance without having to create a new hash?
hash_1 = {"Luke"=> [2,3,4], "Mark"=>[3,5], "Jack"=>[2]}
You don't need a second hash; the code below transforms the values in place.
p hash_1.transform_values!{|x| x.sum/x.count}
Result
{"Luke"=>3, "Mark"=>4, "Jack"=>2}
# Arithmetic mean of +arr+ as a Float, or nil when +arr+ is empty
# (avoiding a division by zero).
def avg(arr)
  arr.sum.fdiv(arr.size) unless arr.empty?
end
h = { "Matthew"=>[2], "Mark"=>[3,6], "Luke"=>[2,3,4], "Jack"=>[] }
h.transform_values { |v| avg(v) }
#=> {"Matthew"=>2.0, "Mark"=>4.5, "Luke"=>3.0, "Jack"=>nil}
#Виктор
OK. How about this:
hash_1 = {"Luke"=> [2,3,4], "Mark"=>[3,5], "Jack"=>[2], "Bobby"=>[]}
hash_2 = hash_1.reduce(Hash.new(0)) do |acc, (k, v)|
v.size > 0 ? acc[k] = v.sum / v.size : acc[k] = 0
acc
end
p hash_2
This solution differs from the one that uses transform_values! because it returns a new Hash object instead of modifying hash_1 in place.
hash_1.map { |k,v| [k, v.sum / v.size] }.to_h

Merge hashes of arrays based on similar positions with Ruby

I have the following two hashes with arrays as values.
a = {
"Us" => [["1", ["1", "2"]], ["2", ["1"]]],
"Pa" => [["1", ["1", "3", "5"]], ["4", ["7"]]]
}
b = {
"Us" => [["1", ["F", "O"]], ["2", ["N"]]],
"Pa" => [["1", ["S", "D", "H"]], ["4", ["K"]]]
}
I'm trying to merge the hashes to get a final hash like this:
c = {
"Us" => [["1", ["1|F", "2|O"]], ["2", ["1|N"]]],
"Pa" => [["1", ["1|S", "3|D", "5|H"]], ["4", ["7|K"]]]
}
I found the following code with merge, and tried to apply it to my issue, but I got an error:
a.merge(b) {|key, a_val, b_val| a_val.merge b_val }
# >> NoMethodError: undefined method `merge' for [["1", ["1", "2"]], ["2", ["1"]]]:Array
I even got an error with a + b:
a + b
# >> NoMethodError: undefined method `+' for #<Hash:0x0000060078e460>
<<<< UPDATE >>>>
Thanks to both Cary and tadman. Going beyond the original question, here are the input file I have and the output I'm trying to obtain. I show them so that you get an idea of why
I generated 2 hashes in that way. In the output I create blocks where fathers are unique values of column 1, below children (unique values in column 2 related with col 1).
Column 3 are subchildren that belong to a value in col2 and column 4 are text contents related with col3.
Probably hash "c" is easier to generate from the beginning.
This is my input file
Main,Stage1,Stage2,Description
Us,1,1,F
Us,1,2,O
Us,2,1,N
Pa,1,1,S
Pa,1,3,D
Pa,1,5,H
Pa,4,7,K
This is the output I almost get.
Main..Stage1..Stage2..Description
Us
......1
..............1.......F
..............2.......O
......2
..............1.......N
Pa
......1
..............1.......S
..............3.......D
..............5.......H
......4
..............7.......K
Then I was able to create this code, but like tadman says, I need to reorder the way I get this to make easier the things, since
I use 4 hashes. After I create hash "a" and "b" I was stuck, since I needed a unique hash to iterate and be able to print in the output structure shown above.
My code before post the question
X = Hash.new{|hsh,key| hsh[key] = [] }
Y = Hash.new{|hsh,key| hsh[key] = [] }
a = Hash.new{|hsh,key| hsh[key] = [] }
b = Hash.new{|hsh,key| hsh[key] = [] }
File.foreach('file.txt').with_index do
|line, line_num|
if line_num > 0
r = line.split(",")
X[r[0] + "°" + r[1]].push r[2]
Y[r[0] + "°" + r[1]].push r[3].strip
end
end
X.each{ |k,v|
lbs = k.split("°")
a[lbs[0]].push [ lbs[1], v] #Here I generate hash a
}
Y.each{ |k,v|
lbs = k.split("°")
b[lbs[0]].push [ lbs[1], v] #Here I generate hash b
}
What you have here is going to require a bit of work to solve because of all the complicated nesting. This would be a lot easier if you did some work to reorder how that data is stored.
Yet you can do this:
a={"Us"=>[["1", ["1", "2"]], ["2", ["1"]]], "Pa"=>[["1", ["1", "3", "5"]], ["4", ["7"]]]}
b={"Us"=>[["1", ["F", "O"]], ["2", ["N"]]], "Pa"=>[["1", ["S", "D", "H"]], ["4", ["K"]]]}
c = a.keys.map do |k|
ah = a[k].to_h
bh = b[k].to_h
[
k,
ah.keys.map do |ka|
[
ka,
ah[ka].zip(bh[ka]).map do |pair|
pair.join('|')
end
]
end
]
end.to_h
# => {"Us"=>[["1", ["1|F", "2|O"]], ["2", ["1|N"]]], "Pa"=>[["1", ["1|S", "3|D", "5|H"]], ["4", ["7|K"]]]}
The key here is rigorous use of map to transform each layer and zip to "zipper" two arrays together into pairs that can then be combined with join into the desired string target. Cast back to a Hash with to_h at the end and you get what you want.
There's an intermediate conversion for each subset to a hash to handle out-of-order situations where one might specify the apparent "keys" in a different sequence.
What you'll want to do is wrap this up in a method with a descriptive name:
def hash_compactor(a,b)
# ... (code) ...
end
That'll help keep it modular. Normally I try and create solutions that handle N arguments by defining it as:
def hash_compactor(*input)
# ...
end
Where input is then an array of various sets in the form you've given. The resulting code is surprisingly a lot more complicated.
Note this makes a lot of assumptions about the input being perfectly matched and will explode if that's not the case.
I suggest you first convert the values of one of the hashes to hashes, as I will explain. Suppose we create a new b.
newbie = b.transform_values(&:to_h)
#=> {"Us"=>{"1"=>["F", "O"], "2"=>["N"]},
# "Pa"=>{"1"=>["S", "D", "H"], "4"=>["K"]}}
We can now use a and newbie to produce the desired return value.
a.each_with_object({}) do |(k,v),h|
h[k] = v.map do |first, arr|
[first, arr.zip(newbie[k][first]).map { |pair| pair.join('|') }]
end
end
#=> {"Us"=>[["1", ["1|F", "2|O"]], ["2", ["1|N"]]],
# "Pa"=>[["1", ["1|S", "3|D", "5|H"]], ["4", ["7|K"]]]}
If a can be mutated it's slightly easier.
a.each do |k,v|
v.map! do |first, arr|
[first, arr.zip(newbie[k][first]).map { |pair| pair.join('|') }]
end
end
The method Hash#transform_values made its debut in Ruby v2.4. To support older versions, one can compute newbie as follows.
newbie = b.each_with_object({}) {|(k,v),h| h[k] = v.to_h }
In this solution we'll keep the original structure.
I've followed your first try but instead of:
a.merge(b) {|key, a_val, b_val| a_val.merge b_val }
Think about using a new custom merge function like:
c = a.merge(b) {|key, a_val, b_val| myMergeArray(a_val, b_val) }
Then the new merge function is a simple recursive one:
# Recursively merges two identically shaped (possibly nested) arrays.
# Leaf values that differ are joined as "left<sep>right"; equal leaves are
# kept once.
#
# A new array is built at every level, so neither +a+ nor +b+ is modified.
# (Starting from `c = a` would alias the argument and rewrite the caller's
# nested arrays in place.)
#
# @param a [Array] left-hand structure
# @param b [Array] right-hand structure, same shape as +a+
# @param sep [String] separator placed between differing leaf values
# @return [Array] a new array with the shape of +a+
def myMergeArray(a, b, sep = '|')
  a.each_with_index.map do |left, i|
    right = b[i]
    if left.is_a?(Array)
      myMergeArray(left, right, sep)
    elsif left == right
      left
    else
      left + sep + right
    end
  end
end
I've assumed that in case of equal elements, just save one, so e.g. "Y" and "Y" yield just "Y" instead of "Y|Y"
Cheers!

Ruby -- convert a nested hash to a multidimensional array

I have a hash which is named h. I want to store the contents in a multidimensional array named ar. I am getting the error no implicit conversion from nil to integer.
Here is my code:
h = {"bob" => {email: "abc" , tel: "123"} , "daisy" => {email: "cab" , tel: "123456"}}
keys = h.keys
l = h.length
ar = Array.new(l) { Array.new(3) }
for i in 0..l-1
ar[[2][i]] = keys[i]
ar[[1][i]] = h[keys[i]][:email]
ar[[0][i]] = h[keys[i]][:tel]
end
puts ar.to_s
The desired output is:
[[email_1, email_2, ..][tel_1, tel_2, ..][name_1, name_2, ..]]
For example:
[["abc", "cab"] , ["123", "123456"] , ["bob", "daisy"]]
This is the way I would handle this:
h.values.each_with_object({}) do |h,obj|
obj.merge!(h) { |_k,v1,v2| ([v1] << v2).flatten }
end.values << h.keys
#=> [["abc", "cab"], ["123", "123456"], ["bob", "daisy"]]
First grab all the values (as Hashes)
loop through them with an accumulator ({})
merge! the values into the accumulator and on conflict append them to an array
return the values from the accumulator
then append the original keys
This is less explicit than #mudasobwa's answer and relies on the order of the first value to determine the output. e.g. if :tel came before :email the first 2 elements would have a reversed order
[2][i] returns nil for i > 0. ar[nil] raises the exception.
Here is what you do:
arr = h.map { |k, v| [v[:email], v[:tel], k] }.reduce(&:zip)
To make your code work:
Change
ar = Array.new(l) { Array.new(3) }
To
ar = Array.new(3) { Array.new(l) }
Change
ar[[2][i]] = keys[i]
ar[[1][i]] = h[keys[i]][:email]
ar[[0][i]] = h[keys[i]][:tel]
To
ar[2][i] = keys[i]
ar[1][i] = h[keys[i]][:email]
ar[0][i] = h[keys[i]][:tel]
What you mostly should do is to stop writing PHP code with Ruby syntax. Here is how it is done in Ruby:
h.map { |k, v| [v[:email], v[:tel], k] }.reduce(&:zip)
or, even better, if you are certain of elements order in nested hashes:
h.map { |k, v| [*v.values, k] }.reduce(&:zip).map(&:flatten)
All the methods map, reduce and zip are thoroughly described in the documentation.
h.map { |k, v| [*v.values_at(:email, :tel), k] }.transpose
#=> [["abc", "cab"], ["123", "123456"], ["bob", "daisy"]]
The intermediate calculation is as follows.
h.map { |k, v| [*v.values_at(:email, :tel), k] }
#=> [["abc", "123", "bob"], ["cab", "123456", "daisy"]]

How do I check if all the elements in my array occur no more than twice?

I'm using Ruby 2.4. Let's say I have an array of strings (which are all just string-i-fied (is that a word?) integers ...
["1", "2", "5", "25", "5"]
How do I write a function that tells me if all the elements in the array occur no more than twice in the array? For example, this array
["1", "3", "3", "55", "3", "2"]
would return false because "3" occurs three times, but this array
["20", "10", "20", "10"]
would return true because none of the elements occur more than twice.
Enumerable#group_by will do the heavy lifting for this:
# True when every distinct element of +a+ occurs at most twice.
# Equal elements are bucketed together and each bucket's size is checked.
def no_element_present_more_than_twice?(a)
  buckets = a.group_by { |element| element }
  buckets.each_value.all? { |members| members.size <= 2 }
end
p no_element_present_more_than_twice?(["1", "3", "3", "55", "3", "2"])
# => false
p no_element_present_more_than_twice?(["20", "10", "20", "10"])
You can determine frequency like this:
frequency = array.reduce(Hash.new(0)) do |counts, value|
counts[value] += 1
counts
end
# => { "1" => 1, "3" => 3, "55" => 1, "2" => 1 }
And you can check if any of them occur more than twice like this:
frequency.values.max > 2
If you want to wrap it up nicely, you can add it to Enumerable:
module Enumerable
  # Counts how often each element occurs.
  #
  # @return [Hash] element => number of occurrences; the hash has a default
  #   of 0, so looking up an element that never occurred yields 0
  def frequency
    Hash.new(0).tap do |counts|
      each { |item| counts[item] += 1 }
    end
  end
end
And then your condition is as simple as:
array.frequency.values.max > 2
Note: this comes as part of Facets.
I've taken it upon myself to benchmark all of your options for you :)
Running each test 1024 times. Test will take about 34 seconds.
_akuhn is faster than _vlasiak by 16x ± 1.0
_vlasiak is faster than _wayne by 3.5x ± 0.1
_wayne is faster than _cary by 10.0% ± 1.0%
_cary is faster than _oneneptune by 10.09% ± 1.0%
_oneneptune is similar to _coreyward
_coreyward is faster than _tadman by 10.0% ± 1.0%
_tadman is faster than _sagarpandya82 by 10.0% ± 1.0%
_sagarpandya82 is faster than _glykyo by 80.0% ± 1.0%
As you can see, #akuhn's answer performs far better than the other algorithms because it exits early once a match has been found.
Note: I edited the answers to produce the same result, but did not edit any of them for optimisation.
Here is the script which produced the benchmarks:
require 'fruity'
arr = Array.new(1000) { |seed|
# seed is used to create the same array on each script run,
# hence the same benchmark results will be produced
Random.new(seed).rand(1..10).to_s
}
class Array
  # Multiset difference: returns a copy of the receiver in which, for every
  # element of +other+, one equal element (the earliest remaining one) has
  # been dropped.
  # NOTE(review): Ruby 2.6+ ships a core Array#difference with set
  # semantics; reopening Array here replaces it.
  def difference(other)
    remaining = Hash.new(0)
    other.each { |e| remaining[e] += 1 }
    reject do |e|
      if remaining[e] > 0
        remaining[e] -= 1
        true
      else
        false
      end
    end
  end
end
compare do
_coreyward do
arr.reduce(Hash.new(0)) { |counts, value|
counts[value] += 1
counts
}.max[1] <= 2
end
_wayne do
arr.group_by(&:itself).none? do |_key, values|
values.count > 2
end
end
_sagarpandya82 do
arr.sort_by(&:to_i).each_cons(3).none? { |a,b,c| a == b && b == c }
end
_tadman do
arr.sort.slice_when { |a,b| a != b }.map(&:length).max.to_i <= 2
end
_cary do
arr.difference(arr.uniq*2).empty?
end
_akuhn do
count = Hash.new(0)
arr.none? { |each| (count[each] += 1) > 2 }
end
_oneneptune do
arr.each_with_object(Hash.new(0)) { |element,counts|
counts[element] += 1
}.values.max < 3
end
_glykyo do
arr.uniq.map{ |element| arr.count(element) }.max <= 2
end
_vlasiak do
arr.none? { |el| arr.count(el) > 2 }
end
end
Try this
count = Hash.new(0)
array.none? { |each| (count[each] += 1) > 2 }
# => true or false
How does this work?
Hash.new(0) creates a hash with default value 0
none? checks a block for all elements and returns whether no element matches
count[each] += 1 increases the count (no nil case since default value is 0)
This is an optimal solution since it breaks as soon as the first offending element is found. All other solutions posted here either scan the entire array or have even worse complexity.
NB, if you want to know which elements appear more than twice (for example to print an error message) use find or find_all instead of none?.
Here's another way, using the method Array#difference:
# True when no element of +arr+ occurs more than twice.
# Relies on the multiset Array#difference defined in this file: removing up
# to two copies of every distinct element must leave nothing behind.
def twice_at_most?(arr)
  allowance = arr.uniq * 2
  arr.difference(allowance).empty?
end
where Array#difference is defined as follows:
class Array
  # Multiset difference: each element of +other+ cancels one equal element
  # of the receiver (the earliest remaining one); everything left over is
  # returned in its original order.
  # NOTE(review): Ruby 2.6+ ships a core Array#difference with set
  # semantics; reopening Array here replaces it.
  def difference(other)
    remaining = Hash.new(0)
    other.each { |e| remaining[e] += 1 }
    reject do |e|
      if remaining[e] > 0
        remaining[e] -= 1
        true
      else
        false
      end
    end
  end
end
After having found many uses for Array#difference, I have proposed that it be adopted as a core method. The document at this link explains how the method works and provides examples of its use.
Let's try it.
twice_at_most? [1, 4, 2, 4, 1, 3, 4]
#=> false
Here
arr.uniq*2
#=> [1, 4, 2, 3, 1, 4, 2, 3]
arr.difference(arr.uniq*2)
#=> [4]
Another example:
twice_at_most? [1, 4, 2, 4, 1, 3, 5]
#=> true
In my point of view, that could be a pretty simple solution:
# True when every element of +array+ occurs at most twice.
# Each element is counted with a full scan of +array+.
def no_more_than_twice_occur?(array)
  array.all? { |el| array.count(el) <= 2 }
end
no_more_than_twice_occur?(["1", "3", "3", "55", "3", "2"]) # => false
no_more_than_twice_occur?(["20", "10", "20", "10"]) # => true
To avoid a lot of temporary overhead, just sort the array and then split it up into chunks of similar elements. You can then find the longest chunk:
# Length of the longest run of equal elements in the sorted copy of +arr+,
# i.e. the highest number of occurrences of any element; 0 for an empty array.
def max_count(arr)
  runs = arr.sort.chunk_while { |a, b| a == b }
  runs.map(&:size).max || 0
end
max_count(%w[ 1 3 3 55 3 2 ])
# => 3
max_count(%w[ 1 3 55 3 2 ])
# => 2
max_count([ ])
# => 0
Just for fun here's one way using each_cons and using none? as used by Wayne Conrad in his answer.
arr.sort_by(&:to_i).each_cons(3).none? { |a,b,c| a == b && b == c }
Here's an all in one method for you.
# True when no element of +arr+ occurs three or more times.
#
# Checking every count individually, instead of comparing the maximum, makes
# an empty array answer true (nothing occurs too often) rather than raising
# NoMethodError on `nil < 3`.
#
# @param arr [Array] elements to inspect
# @return [Boolean]
def lessThanThree(arr)
  counts = arr.each_with_object(Hash.new(0)) { |element, tally| tally[element] += 1 }
  counts.each_value.all? { |count| count < 3 }
end
Basically, takes the array, iterates through creating the hash and counting each occurrence, then the values method just produces an array of all the counts (values) and then max finds the highest value. We check if that's less than three, if it is, return true, else return false. You could replace true or false with a code block.
For each unique item in the array, count how many times that element appears in the array. Of those values, check if the max is <= 2.
# True when every distinct element of +array+ occurs at most twice.
#
# Each unique element is counted and checked directly, so an empty array
# answers true instead of raising NoMethodError on `nil <= 2`, and the scan
# stops at the first element that occurs too often.
#
# @param array [Array] elements to inspect
# @return [Boolean]
def max_occurence_at_most_2?(array)
  array.uniq.all? { |element| array.count(element) <= 2 }
end
Not optimized for speed.

How to find which items in a MASSIVE array appear more than once?

This is a very simple question; which items appear in the list more than once?
array = ["mike", "mike", "mike", "john", "john", "peter", "clark"]
The correct answer is ["mike", "john"].
Seems like we can just do:
# Count against the same array that is being filtered (`ary` is not defined).
array.select { |e| array.count(e) > 1 }.uniq
Problems solved. But wait! What if the array is REALLY big:
1_000_000.times { array.concat("1234567890abcdefghijklmnopqrstuvwxyz".split('')) }
It just so happens I need to figure out how to do this in a reasonable amount of time. We're talking millions and millions of records.
For what it's worth, this massive array is actually a sum of 10-20 smaller arrays. If it's easier to compare those, let me know - I'm stumped.
We're talking 10,000 to 10,000,000 lines per file, hundreds of files.
Does something like
items = 30_000_000
array = items.times.map do
rand(10_000_000)
end
puts "Done with seeding"
puts
puts "Checking what items appear more than once. Size: #{array.size}"
puts
t1 = Time.now
# Returns the distinct items of +array+ that occur more than once, in order
# of first appearance. Occurrences are tallied in a single pass.
def more_than_once(array)
  tally = array.each_with_object(Hash.new(0)) { |item, acc| acc[item] += 1 }
  tally.each_with_object([]) do |(item, count), repeated|
    repeated << item if count > 1
  end
end
res = more_than_once(array)
t2 = Time.now
p res.size
puts "Took #{t2 - t1}"
work for you?
The duration is about 40s on my machine.
Here are two more solutions with a benchmark comparison of these and #Pascal's methods.
Use sets
require 'set'
# Returns the distinct elements of +arr+ that occur more than once, in the
# order in which their second occurrence is met.
#
# @param arr [Array] elements to inspect
# @return [Array] elements seen at least twice
def multi_set(arr)
  # s1 collects every distinct element; Set#add? returns nil when the element
  # was already present, which marks it as a repeat.
  s1 = Set.new
  arr.each_with_object(Set.new) { |e, smulti| smulti.add(e) unless s1.add?(e) }.to_a
end
arr = ["mike", "mike", "mike", "john", "john", "peter", "clark"]
multi_set(arr)
#=> ["mike", "john"]
s1 is being built to include all distinct elements of arr. s1.add?(e) returns nil if s1 already contains e, in which case e is added to smulti if smulti does not already contain that element. (See Set#add?.) smulti, converted to an array, is returned by the method.
Use Array#difference
Array#difference is a method I've proposed be added to Ruby's core. See also my answer here.
class Array
  # Multiset difference: for every element of +other+, one equal element of
  # the receiver (the earliest remaining one) is dropped; the rest is
  # returned in its original order.
  # NOTE(review): Ruby 2.6+ ships a core Array#difference with set
  # semantics; reopening Array here replaces it.
  def difference(other)
    remaining = Hash.new(0)
    other.each { |e| remaining[e] += 1 }
    reject do |e|
      if remaining[e] > 0
        remaining[e] -= 1
        true
      else
        false
      end
    end
  end
end
# Returns the distinct elements of +arr+ that occur more than once.
# Relies on the multiset Array#difference defined in this file: cancelling
# one copy of every distinct element leaves only the surplus copies.
def multi_difference(arr)
  surplus = arr.difference(arr.uniq)
  surplus.uniq
end
Benchmark
# Returns the distinct items of +arr+ that occur more than once, in order of
# first appearance, using one counting pass over +arr+.
def more_than_once(arr)
  counts = Hash.new(0)
  arr.each { |item| counts[item] += 1 }
  counts.reject { |_, count| count <= 1 }.keys
end
require 'fruity'
items = 30_000_000
arr = items.times.map { rand 10_000_000 }
compare do
Pascal { more_than_once(arr) }
Set { multi_set(arr) }
Difference { multi_difference(arr) }
end
Running each test once. Test will take about 4 minutes.
Pascal is faster than Set by 19.999999999999996% ± 10.0%
Set is faster than Difference by 30.000000000000004% ± 10.0%
Of course, difference, if part of the Ruby core, would be coded in C and optimized.

Resources