I have a file of huge size for example 100MB, I need to chunk it into 4 25MB files using golang.
The thing here is, if I use goroutines and read the file, the order of the data inside the files is not preserved. The code I used is:
package main
import (
"bufio"
"fmt"
"log"
"os"
"sync"
"github.com/google/uuid"
)
func main() {
// NOTE(review): this is the problematic version from the question. Four
// workers all receive from the same `lines` channel, so lines are consumed
// in a non-deterministic order and the output chunks do not preserve the
// input order.
file, err := os.Open("sampletest.txt")
if err != nil {
log.Fatal(err)
}
defer file.Close()
lines := make(chan string)
// start four workers to do the heavy lifting
wc1 := startWorker(lines)
wc2 := startWorker(lines)
wc3 := startWorker(lines)
wc4 := startWorker(lines)
scanner := bufio.NewScanner(file)
// Producer goroutine: feed every input line into the shared channel, then
// close it so the workers' range loops terminate.
go func() {
defer close(lines)
for scanner.Scan() {
lines <- scanner.Text()
}
if err := scanner.Err(); err != nil {
log.Fatal(err)
}
}()
// NOTE(review): writefiles runs wg.Wait() inside a goroutine (see below),
// so main may return before the chunk files are fully written.
writefiles(wc1, wc2, wc3, wc4)
}
// writefile writes data to a new, uniquely named file under chunks/.
//
// Bug fixed: the original printed the os.Create error and then fell through
// to Close/WriteString on a nil *os.File, silently writing nothing; now it
// returns early on failure and reports write errors as well.
func writefile(data string) {
	file, err := os.Create("chunks/" + uuid.New().String() + ".txt")
	if err != nil {
		fmt.Println(err)
		return
	}
	defer file.Close()
	if _, err := file.WriteString(data); err != nil {
		fmt.Println(err)
	}
}
// startWorker launches a goroutine that forwards every value received on
// lines to the returned channel, closing that channel once lines is drained.
func startWorker(lines <-chan string) <-chan string {
	out := make(chan string)
	go func() {
		for l := range lines {
			out <- l
		}
		close(out)
	}()
	return out
}
// writefiles drains each worker channel into a single string and writes one
// chunk file per channel, blocking until every file has been written.
//
// Bug fixed: the original called wg.Wait() inside a goroutine, so
// writefiles (and therefore main) could return before any worker finished —
// the process could exit with zero or partially written chunk files.
func writefiles(cs ...<-chan string) {
	var wg sync.WaitGroup
	output := func(c <-chan string) {
		defer wg.Done()
		var d string
		for n := range c {
			d += n
			d += "\n"
		}
		writefile(d)
	}
	wg.Add(len(cs))
	for _, c := range cs {
		go output(c)
	}
	// Wait synchronously so all chunk files exist before returning.
	wg.Wait()
}
Here using this code my file got split into 4 equal files, but the order in it is not preserved.
I am very new to golang, any suggestions are highly appreciated.
I took this code from some site and tweaked it here and there to meet my requirements.
Based on your statement, you should be able to modify the code from running concurrently to sequentially; it's far easier than applying concurrency to existing code.
The work is basically just: remove the concurrent part.
Anyway, below is a simple example of how to achieve what you want. I use your code as the base, and then I remove everything related to concurrent process.
package main
import (
"bufio"
"fmt"
"log"
"os"
"strings"
"github.com/google/uuid"
)
// main reads file.txt into memory line by line, then writes it back out as
// four chunk files of roughly equal line counts; the final chunk absorbs
// any remainder lines.
func main() {
	const split = 4

	file, err := os.Open("file.txt")
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	var texts []string
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		texts = append(texts, scanner.Text())
	}
	if err := scanner.Err(); err != nil {
		log.Fatal(err)
	}

	lengthPerSplit := len(texts) / split
	for i := 0; i < split; i++ {
		start := i * lengthPerSplit
		end := start + lengthPerSplit
		if i == split-1 {
			// Last chunk takes the leftover lines as well.
			end = len(texts)
		}
		writefile(strings.Join(texts[start:end], "\n"))
	}
}
// writefile writes data to a freshly created, uniquely named chunk file in
// the current directory.
//
// Bug fixed: the original printed the os.Create error and then continued
// with a nil *os.File (deferring Close and calling WriteString on it),
// silently writing nothing; now it returns early and reports write errors.
func writefile(data string) {
	file, err := os.Create("chunks-" + uuid.New().String() + ".txt")
	if err != nil {
		fmt.Println(err)
		return
	}
	defer file.Close()
	if _, err := file.WriteString(data); err != nil {
		fmt.Println(err)
	}
}
Here is a simple file splitter. You can handle the leftovers yourself, I added the leftover bytes to 5th file.
package main
import (
	"bufio"
	"fmt"
	"io"
	"os"
)
// main splits sample-text-file.txt into four chunks of equal size; any
// leftover bytes (when the size is not divisible by four) go to a 5th file.
func main() {
	file, err := os.Open("sample-text-file.txt")
	if err != nil {
		panic(err)
	}
	defer file.Close()

	// Chunk size is a quarter of the file size; the original discarded the
	// Stat error with a blank identifier.
	info, err := file.Stat()
	if err != nil {
		panic(err)
	}
	chunkSize := int(info.Size() / 4)

	// reader buffered at chunk size
	bufR := bufio.NewReaderSize(file, chunkSize)

	// Up to five files: four full chunks, then the leftover. io.ReadFull is
	// used instead of a bare Read so each chunk is filled completely, and
	// EOF is treated as normal termination — the original panicked on the
	// fifth read whenever the size was an exact multiple of four.
	for i := range [5]int{} {
		buf := make([]byte, chunkSize)
		rlen, err := io.ReadFull(bufR, buf)
		fmt.Println("Read: ", rlen)
		if rlen > 0 {
			writeFile(i, rlen, &buf)
		}
		if err == io.EOF || err == io.ErrUnexpectedEOF {
			break // end of input reached; last (possibly short) chunk written
		}
		if err != nil {
			panic(err)
		}
	}
}
// Notice bufW as a pointer to avoid exchange of big byte slices
func writeFile(i int, rlen int, bufW *[]byte) {
fname := fmt.Sprintf("file_%v", i)
f, err := os.Create(fname)
defer f.Close()
w := bufio.NewWriterSize(f, rlen)
wbytes := *(bufW)
wLen, err := w.Write(wbytes[:rlen])
if err != nil {
panic(err)
}
fmt.Println("Wrote ", wLen, "to", fname)
w.Flush()
}
Related
I have written a function to read zip archive tomap[string]*zip.File.
// ReadZip indexes a zip archive's entries by file name.
//
// BUG: the deferred r.Close() runs when ReadZip returns, and closing the
// zip.ReadCloser renders the archive unusable for I/O — so calling Open on
// any returned *zip.File afterwards fails (here with "bad file
// descriptor"). Either keep the ReadCloser open while the entries are in
// use, or copy the contents out before closing (see the answer below).
func ReadZip(file string) (map[string]*zip.File, error) {
r, err := zip.OpenReader(file)
if err != nil {
return nil, err
}
defer r.Close()
files := make(map[string]*zip.File)
for _, f := range r.File {
files[f.Name] = f
}
return files, nil
}
But when I try to open a file with `infoRC, err := f["info.json"].Open()`, it fails with the error
`read file.zip: bad file descriptor`.
Is there better way to read zip archive?
Once ReadCloser.Close is called, any of the *zip.File structs are invalid:
Close closes the Zip file, rendering it unusable for I/O.
You need to either:
Keep r open as long as you want to read the ZIP entries, or
make an in-memory/temporary file copy of all of the zip file contents
An example of the latter option:
func ReadZip(file string) (map[string][]byte, error) {
r, err := zip.OpenReader(file)
if err != nil {
return nil, err
}
defer r.Close()
files := make(map[string][]byte)
for _, f := range r.File {
fc, err := f.Open()
if err != nil {
return nil, err
}
contents, err := ioutil.ReadAll(fc)
fc.Close()
if err != nil {
return nil, err
}
files[f.Name] = contents
}
return files, nil
}
I'm writing a program that performs math on matrixes. I want to load them in from a csv file and have the following code:
// Read matrix1.csv and parse every cell as an integer. Any blank cell makes
// strconv.Atoi fail with `strconv.ParseInt: parsing "": invalid syntax`,
// which is the error reported below.
file, err := os.Open("matrix1.csv")
if err != nil {
log.Fatal(err)
}
defer file.Close()
// NOTE(review): the ReadAll error is silently discarded here.
lines, _ := csv.NewReader(file).ReadAll()
for i, line := range lines {
for j, val := range line {
valInt, err := strconv.Atoi(val)
if err != nil {
log.Fatal(err)
}
matrix1[i][j] = valInt
}
}
However the strconv code is throwing an error:
strconv.ParseInt: parsing "": invalid syntax
It appears that everything else in the code is correct, does anyone have any ideas on how to solve this error?
EDIT: I'm now trying to work on outputting my result to a new csv file.
I have the following code:
// NOTE(review): this snippet has several problems:
// - the deferred Close targets `file1`, but the file created here is
//   `file2` (likely a typo)
// - ranging over `blank` with a single variable yields int indexes, hence
//   the compile error "cannot use line2 (type int) as type []string"
// - writer.Flush() is never called, so buffered rows may never reach disk
file2, err := os.Create("result.csv")
if err != nil {
log.Fatal(err)
}
defer file1.Close()
writer := csv.NewWriter(file2)
for line2 := range blank {
writer.Write(line2)
}
}
}
This gives the following error:
cannot use line2 (type int) as type []string in argument to writer.Write
Updated with the suggestions from the comments; however, the above error is now seen.
This means one of the cells of your CSV is blank, I reproduced the error with this code:
package main
import (
"encoding/csv"
"log"
"strconv"
"strings"
)
// main deliberately reproduces the reported error: the second CSV row
// contains an empty cell ("6,7,8,,0"), so strconv.Atoi("") fails with
// `strconv.ParseInt: parsing "": invalid syntax` and log.Fatal fires.
func main() {
matrix1 := [5][5]int{}
file := strings.NewReader("1,2,3,4,5\n6,7,8,,0")
// ReadAll's error is ignored here; the input is a fixed literal.
lines, _ := csv.NewReader(file).ReadAll()
for i, line := range lines {
for j, val := range line {
valInt, err := strconv.Atoi(val)
if err != nil {
log.Fatal(err)
}
matrix1[i][j] = valInt
}
}
}
If you are ok with treating blank cells as 0 this will get you past the error:
// main demonstrates treating blank CSV cells as zero, so strconv.Atoi is
// only invoked on non-empty values and the "parsing \"\"" error no longer
// occurs.
func main() {
	matrix1 := [5][5]int{}
	file := strings.NewReader("1,2,3,4,5\n6,7,8,,0")
	lines, _ := csv.NewReader(file).ReadAll()
	for i, line := range lines {
		for j, val := range line {
			cell := 0
			if val != "" {
				parsed, err := strconv.Atoi(val)
				if err != nil {
					log.Fatal(err)
				}
				cell = parsed
			}
			matrix1[i][j] = cell
		}
	}
}
As mentioned in the comments above, the error was due to a missing value in my CSV file.
Once the file was amended the error is now gone.
When I try to save the map of type map[mapKey]string into a file using gob encoder, it is not saving string in file.
Here mapKey is struct and map value is long json string.
// mapKey is the composite key type for the map being gob-encoded. Both
// fields are exported — gob can only encode exported struct fields, and a
// key type with no exported fields makes Encode fail entirely.
type mapKey struct{
Id1 string
Id2 string
}
And whenever I use a nested map instead of the struct, like:
var m = make(map[string]map[string]string)
It is working fine and saving string properly. I am not sure what I am missing here.
Code to encode, decode and save it in file:
// Save gob-encodes object into a freshly created file at path.
//
// NOTE(review): the error returned by encoder.Encode is discarded — if the
// value cannot be encoded (e.g. a map key type with no exported fields),
// Encode fails silently and the file stays empty. file.Close() is also
// executed unconditionally, even when os.Create failed, and its error is
// ignored. See the corrected version below.
func Save(path string, object interface{}) error {
file, err := os.Create(path)
if err == nil {
encoder := gob.NewEncoder(file)
encoder.Encode(object)
}
file.Close()
return err
}
// Decode Gob file
func Load(path string, object interface{}) error {
file, err := os.Open(path)
if err == nil {
decoder := gob.NewDecoder(file)
err = decoder.Decode(object)
}
file.Close()
return err
}
// Check prints the caller's source location together with the error and
// terminates the process with status 1 when e is non-nil; it is a no-op
// for a nil error.
func Check(e error) {
	if e == nil {
		return
	}
	_, file, line, _ := runtime.Caller(1)
	fmt.Println(line, "\t", file, "\n", e)
	os.Exit(1)
}
There is nothing special in encoding a value of type map[mapKey]string.
See this very simple working example which uses the specified reader/writer:
func save(w io.Writer, i interface{}) error {
return gob.NewEncoder(w).Encode(i)
}
func load(r io.Reader, i interface{}) error {
return gob.NewDecoder(r).Decode(i)
}
Testing it with an in-memory buffer (bytes.Buffer):
// Round-trip a map keyed by a struct through an in-memory bytes.Buffer.
m := map[mapKey]string{
{"1", "2"}: "12",
{"3", "4"}: "34",
}
fmt.Println(m)
buf := &bytes.Buffer{}
if err := save(buf, m); err != nil {
panic(err)
}
// load needs a pointer so Decode can populate m2 in place.
var m2 map[mapKey]string
if err := load(buf, &m2); err != nil {
panic(err)
}
fmt.Println(m2)
Output as expected (try it on the Go Playground):
map[{1 2}:12 {3 4}:34]
map[{1 2}:12 {3 4}:34]
You have a working code, but know that you have to call Load() with a pointer value (else Decoder.Decode() wouldn't be able to modify its value).
Also a few things to improve it:
In your example you are swallowing the error returned by Encoder.Encode() (check it and you'll see what the problem is; a common problem is using a struct mapKey with no exported fields in which case an error of gob: type main.mapKey has no exported fields would be returned).
Also you should call File.Close() as a deferred function.
Also if opening the file fails, you should return early and you shouldn't close the file.
This is the corrected version of your code:
func Save(path string, object interface{}) error {
file, err := os.Create(path)
if err != nil {
return err
}
defer file.Close()
return gob.NewEncoder(file).Encode(object)
}
func Load(path string, object interface{}) error {
file, err := os.Open(path)
if err != nil {
return err
}
defer file.Close()
return gob.NewDecoder(file).Decode(object)
}
Testing it:
// Same round trip as the buffer example, but through a file on disk using
// the corrected Save/Load.
m := map[mapKey]string{
{"1", "2"}: "12",
{"3", "4"}: "34",
}
fmt.Println(m)
if err := Save("testfile", m); err != nil {
panic(err)
}
// Load takes a pointer so the decoded map lands in m2.
var m2 map[mapKey]string
if err := Load("testfile", &m2); err != nil {
panic(err)
}
fmt.Println(m2)
Output as expected:
map[{1 2}:12 {3 4}:34]
map[{1 2}:12 {3 4}:34]
I had this convenient function in Python:
def follow(path):
    """Yield lines appended to the file at *path*, like ``tail -f``.

    Seeks to end-of-file first so only newly appended lines are produced;
    polls every 0.1 s whenever no complete line is available yet.
    """
    # Bug fix: the original opened ``self.path`` even though this is a free
    # function whose parameter is ``path`` — a NameError at runtime.
    with open(path) as lines:
        lines.seek(0, 2)  # seek to EOF
        while True:
            line = lines.readline()
            if not line:
                time.sleep(0.1)
                continue
            yield line
It does something similar to UNIX tail -f: you get last lines of a file as they come. It's convenient because you can get the generator without blocking and pass it to another function.
Then I had to do the same thing in Go. I'm new to this language, so I'm not sure whether what I did is idiomatic/correct enough for Go.
Here is the code:
// Follow returns a channel on which each line appended to fileName is
// delivered, emulating `tail -f`.
//
// NOTE(review): the `for` loop never exits, so the `defer file.Close()` and
// `close(out_chan)` after it are unreachable — the file handle leaks and
// the channel is never closed. log.Fatal also aborts the entire process if
// the file is missing, and ReadLine's isPrefix result is discarded, so very
// long lines may be delivered in pieces.
func Follow(fileName string) chan string {
out_chan := make(chan string)
file, err := os.Open(fileName)
if err != nil {
log.Fatal(err)
}
// Start reading at the current end of the file, like tail -f.
file.Seek(0, os.SEEK_END)
bf := bufio.NewReader(file)
go func() {
for {
line, _, _ := bf.ReadLine()
if len(line) == 0 {
// No complete line available yet: poll again shortly.
time.Sleep(10 * time.Millisecond)
} else {
out_chan <- string(line)
}
}
defer file.Close()
close(out_chan)
}()
return out_chan
}
Is there any cleaner way to do this in Go? I have a feeling that using an asynchronous call for such a thing is an overkill, and it really bothers me.
Create a wrapper around a reader that sleeps on EOF:
type tailReader struct {
io.ReadCloser
}
func (t tailReader) Read(b []byte) (int, error) {
for {
n, err := t.ReadCloser.Read(b)
if n > 0 {
return n, nil
} else if err != io.EOF {
return n, err
}
time.Sleep(10 * time.Millisecond)
}
}
func newTailReader(fileName string) (tailReader, error) {
f, err := os.Open(fileName)
if err != nil {
return tailReader{}, err
}
if _, err := f.Seek(0, 2); err != nil {
return tailReader{}, err
}
return tailReader{f}, nil
}
This reader can be used anywhere an io.Reader can be used. Here's how loop over lines using bufio.Scanner:
// Line-by-line tailing: Scanner.Scan blocks inside tailReader.Read until
// new data is appended to the file.
t, err := newTailReader("somefile")
if err != nil {
log.Fatal(err)
}
defer t.Close()
scanner := bufio.NewScanner(t)
for scanner.Scan() {
fmt.Println(scanner.Text())
}
if err := scanner.Err(); err != nil {
fmt.Fprintln(os.Stderr, "reading:", err)
}
The reader can also be used to loop over JSON values appended to the file:
// Tailing JSON values: the decoder consumes each value appended to the
// file as soon as tailReader.Read delivers its bytes.
t, err := newTailReader("somefile")
if err != nil {
log.Fatal(err)
}
defer t.Close()
dec := json.NewDecoder(t)
for {
var v SomeType
if err := dec.Decode(&v); err != nil {
log.Fatal(err)
}
fmt.Println("the value is ", v)
}
There are a couple of advantages this approach has over the goroutine approach outlined in the question. The first is that shutdown is easy. Just close the file. There's no need to signal the goroutine that it should exit. The second advantage is that many packages work with io.Reader.
The sleep time can be adjusted up or down to meet specific needs. Decrease the time for lower latency and increase the time to reduce CPU use. A sleep of 100ms is probably fast enough for data that's displayed to humans.
Check out this Go package for reading from continuously updated files (tail -f): https://github.com/hpcloud/tail
// Example with github.com/hpcloud/tail: t.Lines is a channel that delivers
// each line appended to the file while Follow is enabled.
t, err := tail.TailFile("filename", tail.Config{Follow: true})
for line := range t.Lines {
fmt.Println(line.Text)
}
I'm trying to map an array to a file via Mmap; the array could be of any type, like float64. In C, I found this one. After reading some texts, I wrote this sample. I don't know if it is correct, and it is not writing the values to the file. If I increase the size of the array a lot, e.g. from 1000 to 10000, it crashes. If someone knows how to do this correctly, please tell me.
Thanks!
For example, revising your sample program,
package main
import (
"fmt"
"os"
"syscall"
"unsafe"
)
// main memory-maps /tmp/test.dat, writes n squares into it through a
// fixed-size []int view of the mapping, prints the array, then unmaps and
// closes the file. Any failure is printed and the process exits with
// status 1.
func main() {
	const n = 1e3

	// fail mirrors the original inline error handling: print and exit(1).
	fail := func(err error) {
		if err != nil {
			fmt.Println(err)
			os.Exit(1)
		}
	}

	size := int(unsafe.Sizeof(0)) * n

	mapFile, err := os.Create("/tmp/test.dat")
	fail(err)

	// Grow the file to the mapped size by writing one byte at the end.
	_, err = mapFile.Seek(int64(size-1), 0)
	fail(err)
	_, err = mapFile.Write([]byte(" "))
	fail(err)

	mmap, err := syscall.Mmap(int(mapFile.Fd()), 0, size, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED)
	fail(err)

	// Reinterpret the mapped bytes as a fixed-size int array.
	arr := (*[n]int)(unsafe.Pointer(&mmap[0]))
	for i := 0; i < n; i++ {
		arr[i] = i * i
	}
	fmt.Println(*arr)

	fail(syscall.Munmap(mmap))
	fail(mapFile.Close())
}