We are using the MovieLens dataset from GroupLens Research. It contains a collection of users, movies, and ratings gathered through the MovieLens website (http://www.movielens.org), which is run by academic researchers at the University of Minnesota.
We need to parse the ratings.dat file, which is delimited with a double colon (::), for user ids, ratings, and movie ids. We can then match up the movie ids with those in movies.dat.
First, let's look at the code we need to build our index of movies:
package main
import (
// "github.com/aotimme/rbm"
"fmt"
"log"
"math"
"strconv"
"github.com/yunabe/easycsv"
g "gorgonia.org/gorgonia"
"gorgonia.org/tensor"
)
// datasetfilename is the cleaned, comma-delimited MovieLens ratings file
// consumed by DataImport.
var datasetfilename string = "dataset/cleanratings.csv"
// movieindexfilename is the cleaned movie-id -> title CSV consumed by
// BuildMovieIndex.
var movieindexfilename string = "dataset/cleanmovies.csv"
// BuildMovieIndex reads the cleaned movies CSV at input and returns a map
// from movie id to movie title, used later to match column indices back to
// human-readable titles.
//
// Each CSV row is expected to be (id, title). If the file cannot be read,
// the program exits via log.Fatalf, mirroring the error handling in
// DataImport.
func BuildMovieIndex(input string) map[int]string {
	r := easycsv.NewReaderFile(input, easycsv.Option{
		Comma: ',',
	})
	var entry struct {
		Id    int    `index:"0"`
		Title string `index:"1"`
	}
	// No size hint: the previous hardcoded capacity (3952) was flagged as a
	// TODO and only matched one specific dataset snapshot.
	movieindex := make(map[int]string)
	for r.Read(&entry) {
		movieindex[entry.Id] = entry.Title
	}
	// Surface parse/IO errors instead of silently returning a partial index.
	if err := r.Done(); err != nil {
		log.Fatalf("Failed to read a CSV file: %v", err)
	}
	return movieindex
}
Now, we write a function to import the raw data and turn it into an m x n matrix. In this, the rows represent individual users and the columns are their (normalized) ratings across every movie in our dataset:
func DataImport(input string) (out [][]int, uniquemovies map[int]int) {
//
// Initial data processing
//
// import from CSV, read into entries var
r := easycsv.NewReaderFile(input, easycsv.Option{
Comma: ',',
})
var entry []int
var entries [][]int
for r.Read(&entry) {
entries = append(entries, entry)
}
// maps for if unique true/false
seenuser := make(map[int]bool)
seenmovie := make(map[int]bool)
// maps for if unique index
uniqueusers := make(map[int]int)
uniquemovies = make(map[int]int)
// counters for uniques
var uniqueuserscount int = 0
var uniquemoviescount int = 0
// distinct movie lists/indices
for _, e := range entries {
if seenmovie[e[1]] == false {
uniquemovies[uniquemoviescount] = e[1]
seenmovie[e[1]] = true
uniquemoviescount++
} else if seenmovie[e[1]] == true {
// fmt.Printf("Seen movie %v before, aborting ", e[0])
continue
}
}
// distinct user lists/indices
for _, e := range entries {
if seenuser[e[0]] == false {
uniqueusers[uniqueuserscount] = e[0]
seenuser[e[0]] = true
uniqueuserscount++
// uniqueusers[e[0]] =
} else if seenuser[e[0]] == true {
// fmt.Printf("Seen user %v before, aborting ", e[0])
continue
}
}
uservecs := make([][]int, len(uniqueusers))
for i := range uservecs {
uservecs[i] = make([]int, len(uniquemovies))
}
The following is the main loop where we process each line from the CSV, and then add to the master slices of users and sub-slices of movie ratings with the correct index:
var entriesloop int
for _, e := range entries {
// hack - wtf
if entriesloop%100000 == 0 && entriesloop != 0 {
fmt.Printf("Processing rating %v of %v ", entriesloop, len(entries))
}
if entriesloop > 999866 {
break
}
var currlike int
// normalisze ratings
if e[2] >= 4 {
currlike = 1
} else {
currlike = 0
}
// add to a user's vector of index e[1]/movie num whether current movie is +1
// fmt.Println("Now looping uniquemovies")
for i, v := range uniquemovies {
if v == e[1] {
// fmt.Println("Now setting uservec to currlike")
// uservec[i] = currlike
// fmt.Println("Now adding to uservecs")
uservecs[e[0]][i] = currlike
break
}
}
// fmt.Printf("Processing rating %v of %v ", entriesloop, len(entries))
entriesloop++
}
// fmt.Println(uservecs)
// os.Exit(1)
// fmt.Println(entry)
if err := r.Done(); err != nil {
log.Fatalf("Failed to read a CSV file: %v", err)
}
// fmt.Printf("length uservecs %v and uservecs.movies %v", len(uservecs))
fmt.Println("Number of unique users: ", len(seenuser))
fmt.Println("Number of unique movies: ", len(seenmovie))
out = uservecs
return
}