1 / 42

Movie Voting Algorithm

This algorithm uses user ratings and movie properties to determine the voting confidence for a movie.

djohns
Download Presentation

Movie Voting Algorithm

An image/link is provided below (as is) to download the presentation. Download Policy: Content on the Website is provided to you AS IS for your information and personal use and may not be sold / licensed / shared on other websites without getting consent from its author. Content is provided to you AS IS for your information and personal use only. Download the presentation by clicking this link. While downloading, if for some reason you are not able to download a presentation, the publisher may have deleted the file from their server. During download, if you can't get a presentation, the file might have been deleted by the publisher.

E N D

Presentation Transcript


  1. Netflix data:{mk} k=1..17770 UserTable(uID,m1,...,m17770) UPTreeSet 3*17770 bitslices wide m1 ... mh . . . m17770 u1 : uk . . . u480189 m0,2 . . . m17769,0 u1 : uk . . . u480189 mk(u,r,d) avg:5655u/m uID rating date u i1rmk,u dmk,u ui2 . . . ui n k rmhuk 1/0 Main:(m,u,r,d) avg:209m/u mID uID rating date m1 u 1 rm,u dm,u m1 u2 . . . m17770 u480189 r17770,480189 d17770,480189 or U2649429 ↑ ↑ ↓ 47B ↓ ↑ ↑ ↓ 47B ↓ ← -------- 100,480,507 -------- → MovieTable(mID,u1...u480189) MPTreeSet 3*480189 bitslices wide u1 uk u480189 m1 : m h : m17770 u0,2 u480189,0 m1 : m h : m17770 rmhuk 0/1 ↑ 47B ↓ ↑ ↑ ↓ 47B ↓ ↑ ↓

  2. extern double movie_vote(PredictionConfig *pcfg, // 2010_11_13 notes unsigned long int M, PTree & supportM, unsigned long int U, PTree & supportU) { auto double MU=Users.get_rating(U,M)-2, VOTE=DEFAULT_VOTE,VOTE_sum=0,VOTE_cnt=0,Nb,Mb,dsSq,UCor=1, supportUsize=supportU.get_count(),supportMsize=supportM.get_count(); struct pruning *internal_prune; struct external_prune *external_prune; auto PTree supM=supportM, supU=supportU; supM.clearbit(U); supU.clearbit(M); movie-vote.C ARM code1 /* External pruning: Prune Users supM */ external_prune = pcfg->get_movie_Prune_Users_in_SupM(); if (external_prune->enabled) { if(supM.get_count()>external_prune->params.Ct) do_pruning(external_prune, M, U, supM, supU); supM.clearbit(U); supU.clearbit(M); if((supM.get_count()<1)||(supU.get_count()<1)) return VOTE;} /* External pruning: Prune Movies supU */ external_prune = pcfg->get_movie_Prune_Movies_in_SupU(); if (external_prune->enabled) { if(supU.get_count()>external_prune->params.Ct ) do_pruning(external_prune, M, U, supM, supU); supM.clearbit(U); supU.clearbit(M); if((supM.get_count()<1) || (supU.get_count()<1) ) return VOTE; auto PTreeSet & U_ptree_set= Users.get_ptreeset(), & M_ptree_set=Movies.get_ptreeset(); supU.clearbit(M); supM.clearbit(U); auto PTree supU_1=supU&(~U_ptree_set[(U*3)+0])&( U_ptree_set[(U*3)+1])&( U_ptree_set[(U*3)+2]), supU_2=supU&( U_ptree_set[(U*3)+0])&(~U_ptree_set[(U*3)+1])&(~U_ptree_set[(U*3)+2]), supU_3=supU&( U_ptree_set[(U*3)+0])&(~U_ptree_set[(U*3)+1])&( U_ptree_set[(U*3)+2]), supU_4=supU&( U_ptree_set[(U*3)+0])&( U_ptree_set[(U*3)+1])&(~U_ptree_set[(U*3)+2]), supU_5=supU&( U_ptree_set[(U*3)+0])&( U_ptree_set[(U*3)+1])&( U_ptree_set[(U*3)+2]), supM_1=supM&(~M_ptree_set[(M*3)+0])&( M_ptree_set[(M*3)+1])&( M_ptree_set[(M*3)+2]), supM_2=supM&( M_ptree_set[(M*3)+0])&(~M_ptree_set[(M*3)+1])&(~M_ptree_set[(M*3)+2]), supM_3=supM&( M_ptree_set[(M*3)+0])&(~M_ptree_set[(M*3)+1])&( M_ptree_set[(M*3)+2]), supM_4=supM&( M_ptree_set[(M*3)+0])&( 
M_ptree_set[(M*3)+1])&(~M_ptree_set[(M*3)+2]), supM_5=supM&( M_ptree_set[(M*3)+0])&( M_ptree_set[(M*3)+1])&( M_ptree_set[(M*3)+2]), sou, souM, souU, som, somU, somM, spM, spU; auto double thr1, expnt1, thr2, expnt2, s, S, ss, sn, sM, sU, c, C, wt, XBalVT, wt_const=16; //SAMPLE-stat dMNsds pruning config parms hijacked for ARM parm use. internal_prune = pcfg->get_internal_prune(movie_dMNsds); thr1=internal_prune->threshold; expnt1=internal_prune->exponent; internal_prune = pcfg->get_internal_prune(movie_Nsds_Msds); thr2=internal_prune->threshold; expnt2=internal_prune->exponent; }

  3. auto unsigned long long int *supUlist_1=supU_1.get_indexes(); for ( unsigned long long int n = 0; n < supU_1.get_count(); ++n) //NLoop(voters) {auto unsigned long long int N=supUlist_1[n]; auto PTree supN = Movies.get_users(N), supN_1=supN&(~M_ptree_set[(N*3)+0])&( M_ptree_set[(N*3)+1])&( M_ptree_set[(N*3)+2]), supN_2=supN&( M_ptree_set[(N*3)+0])&(~M_ptree_set[(N*3)+1])&(~M_ptree_set[(N*3)+2]), supN_3=supN&( M_ptree_set[(N*3)+0])&(~M_ptree_set[(N*3)+1])&( M_ptree_set[(N*3)+2]), supN_4=supN&( M_ptree_set[(N*3)+0])&( M_ptree_set[(N*3)+1])&(~M_ptree_set[(N*3)+2]), supN_5=supN&( M_ptree_set[(N*3)+0])&( M_ptree_set[(N*3)+1])&( M_ptree_set[(N*3)+2]), csM1N1= supM_1 & supN_1, csM1N2= supM_1 & supN_2, csM1N3= supM_1 & supN_3, csM1N4= supM_1 & supN_4, csM1N5= supM_1 & supN_5; auto double NU = Users.get_rating(U,N)-2, sM1N1=csM1N1.get_count(), sN1=supN_1.get_count(), if(((sM1N1 > sN1*expnt1)) && ((sN1 > thr1))){VOTE_sum+=UCor*NU; VOTE_cnt+=UCor ;} sM1N2=csM1N2.get_count(), sN2=supN_2.get_count(), sM1N3=csM1N3.get_count(), sN3=supN_2.get_count(), adequate confidence sM1N4=csM1N4.get_count(), sN4=supN_4.get_count(), adequate support sM1N5=csM1N5.get_count(), sN5=supN_5.get_count(); if(((sM1N2 > sN2*.9 )) && ((sN2 > .9 ))){VOTE_sum+=UCor*NU; VOTE_cnt+=UCor ;} if(((sM1N3 > sN3*.9 )) && ((sN3 > .9 ))){VOTE_sum+=UCor*NU; VOTE_cnt+=UCor ;} if(((sM1N4 > sN4*.9 )) && ((sN4 > .9 ))){VOTE_sum+=UCor*NU; VOTE_cnt+=UCor ;} if(((sM1N5 > sN5*.9 )) && ((sN5 > .9 ))){VOTE_sum+=UCor*NU; VOTE_cnt+=UCor ;} Nk M1Nk M1 movie-vote.C ARM M1N1

  4. auto unsigned long long int *supUlist_2=supU_2.get_indexes(); for ( unsigned long long int o = n; o < supU_2.get_count(); ++o) //nested OLoop {auto unsigned long long int O=supUlist_1[o]; auto PTree supN = Movies.get_users(O), supO_2=supN&( M_ptree_set[(O*3)+0])&(~M_ptree_set[(O*3)+1])&(~M_ptree_set[(O*3)+2]), supO_3=supN&( M_ptree_set[(O*3)+0])&(~M_ptree_set[(O*3)+1])&( M_ptree_set[(O*3)+2]), supO_4=supN&( M_ptree_set[(O*3)+0])&( M_ptree_set[(O*3)+1])&(~M_ptree_set[(O*3)+2]), supO_5=supN&( M_ptree_set[(O*3)+0])&( M_ptree_set[(O*3)+1])&( M_ptree_set[(O*3)+2]), csM1N1O2= supM_1 & supN_1 & supO_2, csN1O1= supM_1 & supN_1 & supO_2, csM1N1O3= supM_1 & supN_1 & supO_3, csN1O1= supM_1 & supN_1 & supO_3, csM1N1O4= supM_1 & supN_1 & supO_4, csN1O1= supM_1 & supN_1 & supO_4, csM1N1O5= supM_1 & supN_1 & supO_5, csN1O1= supM_1 & supN_1 & supO_5; auto double OU = Users.get_rating(U,O)-2, sM1N1O2= csM1N1O2.get_count(), sN1O2= csN1O2.get_count() , sM1N1O3= csM1N1O3.get_count(), sN1O3= csN1O3.get_count() , sM1N1O4= csM1N1O4.get_count(), sN1O4= csN1O4.get_count() , sM1N1O5= csM1N1O5.get_count(), sN1O5= csN1O5.get_count() ; if(((sM1N1O2> sN1O2*expnt1))&& ((sN1O2> thr1))){VOTE_sum+=UCor*NU; VOTE_cnt+=UCor ;} if(((sM1N1O3> sN1O3*expnt1))&& ((sN1O3> thr1))){VOTE_sum+=UCor*NU; VOTE_cnt+=UCor ;} if(((sM1N1O4> sN1O4*expnt1))&& ((sN1O4> thr1))){VOTE_sum+=UCor*NU; VOTE_cnt+=UCor ;} if(((sM1N1O5> sN1O5*expnt1))&& ((sN1O5> thr1))){VOTE_sum+=UCor*NU; VOTE_cnt+=UCor ;} N1 M1N1O2 M1 N1O2 movie-vote.C ARM code3.1 O2

  5. auto PTree csM1N2O3= supM_1 & supN_2 & supO_3, csN2O1= supM_1 & supN_2 & supO_3, csM1N2O4= supM_1 & supN_2 & supO_4, csN2O1= supM_1 & supN_2 & supO_4, csM1N2O5= supM_1 & supN_2 & supO_5, csN2O1= supM_1 & supN_2 & supO_5; auto double sM1N2O3= csM1N2O3.get_count(), sN2O3= csN2O3.get_count() , sM1N2O4= csM1N2O4.get_count(), sN2O4= csN2O4.get_count() , sM1N2O5= csM1N2O5.get_count(), sN2O5= csN2O5.get_count() ; if(((sM1N2O3> sN2O3*expnt1))&& ((sN2O3> thr1))){VOTE_sum+=UCor*NU; VOTE_cnt+=UCor ;} if(((sM1N2O4> sN2O4*expnt1))&& ((sN2O4> thr1))){VOTE_sum+=UCor*NU; VOTE_cnt+=UCor ;} if(((sM1N2O5> sN2O5*expnt1))&& ((sN2O5> thr1))){VOTE_sum+=UCor*NU; VOTE_cnt+=UCor ;} N2 M1N1O2 M1 N2O3 movie-vote.C ARM code3.2 O3

  6. auto PTree csM1N3O4= supM_1 & supN_3 & supO_4, csN1O1= supM_1 & supN_3 & supO_4, csM1N3O5= supM_1 & supN_3 & supO_5, csN1O1= supM_1 & supN_3 & supO_5; auto double sM1N3O4= csM3N2O4.get_count(), sN3O4= csN3O4.get_count() , sM1N3O5= csM3N2O5.get_count(), sN3O5= csN3O5.get_count() ; if(((sM1N3O4> sN3O4*expnt1))&& ((sN3O4> thr1))){VOTE_sum+=UCor*NU; VOTE_cnt+=UCor ;} if(((sM1N3O5> sN3O5*expnt1))&& ((sN3O5> thr1))){VOTE_sum+=UCor*NU; VOTE_cnt+=UCor ;} movie-vote.C ARM code3.3

  7. auto PTree csM1N4O5= supM_1 & supN_4 & supO_5, csN4O1= supM_1 & supN_4 & supO_5; auto double sM1N4O5= csM1N4O5.get_count(), sN4O5= csN4O5.get_count() ; if(((sM1N4O5> sN4O5*expnt1))&& ((sN4O5> thr1))){VOTE_sum+=UCor*NU; VOTE_cnt+=UCor ;} movie-vote.C ARM code3.4

  8. movie-vote.C ARM code 4 /* Nearest Neighbor Code */ supU.clearbit(M); auto unsigned long long int *supUlist = supU.get_indexes(); for ( unsigned long long int n= 0; n < supU.get_count(); ++n) //NLOOP (voters) {auto unsigned long long int N=supUlist[n]; if (N == M) continue; auto double NU=Users.get_rating(U,N)-2,MAX=0,smN=0,smM=0,MM=0,MN=0,NN=0,denom=0,dm; auto PTree supN=Movies.get_users(N), csMN= supM & supN; csMN.clearbit(U); dm=csMN.get_count(); if(dm<1) continue; /* External pruning: PRUNE USERS CoSupMN */ external_prune = pcfg->get_movie_Prune_Users_in_CoSupMN(); if (external_prune->enabled) { if(csMN.get_count()>external_prune->params.Ct) do_pruning(external_prune,M,U,csMN,supU); csMN.clearbit(U); supU.clearbit(M); dm=csMN.get_count(); if(dm<1) continue;} /*Adjusted Cosine*/auto double ACCor,Vbar,ACCnum=0,ACCden,ACCdenSum1=0,ACCdenSum2=0; auto unsigned long long int *csMNlist=csMN.get_indexes(); for (unsigned long long int v= 0; v < csMN.get_count(); ++v){ //VLOOP (dims) auto unsigned long long int V= csMNlist[v]; auto double MV=Users.get_rating(V,M)-2, NV=Users.get_rating(V,N)-2; if(pow(MV-NV,2) > MAX) MAX=pow(MV-NV,2); smN+=NV; smM+=MV; MM+=MV*MV; MN+=NV*MV; NN+=NV*NV; ++denom; /* Adjusted Cosine code */ auto PTree supV=Users.get_movies(V); Vbar=Users.get_mean(V,supV); ACCnum+=(NV-Vbar)*(MV-Vbar); ACCdenSum1+=(NV-Vbar)*(NV-Vbar); ACCdenSum2+=(MV-Vbar)*(MV-Vbar); } //VLOOP ends

  9. movie-vote.C ARM5 /* Adjusted Cosine code */ ACCden=pow(ACCdenSum1,.5)*pow(ACCdenSum2,.5); ACCor=ACCnum/ACCden;UCor=ACCor;dm=csMN.get_count(); if(denom<1) continue; else {Nb=smN/dm; Mb=smM/dm; dsSq=NN-2*MN MM; VOTE=NU-Nb+Mb;} if (UCor>0) {VOTE_sum+=VOTE*UCor; VOTE_cnt+=UCor; } else continue; if ( pcfg->movie_vote_force_in_loop() ) { if ( (VOTE<1) && (VOTE!= DEFAULT_VOTE) ) VOTE=1; if ( (VOTE>5) && (VOTE!= DEFAULT_VOTE) ) VOTE=5; } } /* end NLOOP (movie voters) */ if ( VOTE_cnt>0 ) VOTE=VOTE_sum/VOTE_cnt; else VOTE=DEFAULT_VOTE; /* force_vote_after_Voter_Loop goes here. */ if ( pcfg->movie_vote_force_after_loop() ) { if ( (VOTE < 1) && (VOTE != DEFAULT_VOTE) ) VOTE=1; if ( (VOTE > 5) && (VOTE != DEFAULT_VOTE) ) VOTE=5; return VOTE; }

  10. movie-vote.C ARM code /** Public function. This function implements movie voting. * \param pcfg A pointer to the class containing the parameters which configure voting. * \param M The movie number for which a prediction is to be made * \param supportM The PTree identifying the support for the movie to be predicted. * \param U The identity number of the user for which a prediction is to be made. * \param supportU The Ptree identifying the support for the user who a predication is being made for. * \return The recommended prediction. */ extern double movie_vote(PredictionConfig *pcfg, unsigned long int M, \ PTree & supportM, unsigned long int U, PTree & supportU) { auto double MU=Users.get_rating(U,M)-2; //for PROBE-run diag print. Take out for QUAL-runs.) auto double VOTE=DEFAULT_VOTE, VOTE_sum=0, VOTE_cnt=0, Nb,Mb,dsSq,UCor=1, supportUsize=supportU.get_count(),supportMsize=supportM.get_count(); struct pruning *internal_prune; struct external_prune *external_prune; auto PTree supM = supportM, supU = supportU; supM.clearbit(U);supU.clearbit(M); /* External pruning: Prune Users supM */ external_prune = pcfg->get_movie_Prune_Users_in_SupM(); if (external_prune->enabled) {if(supM.get_count()>external_prune->params.Ct) do_pruning(external_prune, M, U, supM, supU); supM.clearbit(U); supU.clearbit(M); if ( (supM.get_count() < 1) || (supU.get_count() < 1) ) return VOTE; } /* Reset support if requested. */ if (pcfg->reset_movie_support()) {supU=supportU; supU.clearbit(M);} /* External pruning: Prune Movies supU */ external_prune = pcfg->get_movie_Prune_Movies_in_SupU(); if (external_prune->enabled) {if(supU.get_count()>external_prune->params.Ct ) do_pruning(external_prune, M, U, supM, supU); supM.clearbit(U); supU.clearbit(M); if( (supM.get_count() < 1) || (supU.get_count() < 1) ) return VOTE; }

  11. movie-vote.C ARM code 2 /** ARM Code **** * First an EXPLANATION of the ratings Ptree implemention: * Actual ratings are first translated from 1,2,3,4,5 to 3,4,5,6,7 (E.g., * rating=1 is implemented in Ptrees as rating=3), rating=2 as rating=4), * rating=3 is implemented in Ptrees as rating=5), rating=4 as rating=6), * rating=5 is implemented in Ptrees as rating=7). * This design decision was made so that rating=0 (which means "not rated" * and does NOT mean "the very lowest rating") would be at a maximum separation * from the lowest true rating, yet all ratings could still be implemented with * 3 Ptrees (all rating values are 3-bit numbers). Thus, Ptrees represent ratings as * 0=000 (movie not rated by user), 3=011 (very low or a 1-star rating), * 4=100 (low or a 2-star rating), 5=101 (average or a 3-star rating), * 6=110 (high or a 4-star rating), 7=111 (very high or a 5-star rating). * We also partition (cluster) the movies in the support of user predictee, U, by rating. * This is done so that we can restrict to only those pertinent user predictor voters * for each rating value in our ARM code (don't need to loop through all voters, * e.g., for ARM done on rating=1 relationship, but only user that predict 1 for M)*/ auto PTreeSet & U_ptree_set=Users.get_ptreeset(), & M_ptree_set=Movies.get_ptreeset(); supU.clearbit(M); supM.clearbit(U); auto PTree supU_1=supU& (~U_ptree_set[(U*3)+0])& ( U_ptree_set[(U*3)+1])& ( U_ptree_set[(U*3)+2]), supU_2=supU& ( U_ptree_set[(U*3)+0])& (~U_ptree_set[(U*3)+1])& (~U_ptree_set[(U*3)+2]), supU_3=supU& ( U_ptree_set[(U*3)+0])& (~U_ptree_set[(U*3)+1])& ( U_ptree_set[(U*3)+2]), supU_4=supU& ( U_ptree_set[(U*3)+0])& ( U_ptree_set[(U*3)+1])& (~U_ptree_set[(U*3)+2]), supU_5=supU& ( U_ptree_set[(U*3)+0])& ( U_ptree_set[(U*3)+1])& ( U_ptree_set[(U*3)+2]), supM_1=supM& (~M_ptree_set[(M*3)+0])& ( M_ptree_set[(M*3)+1])& ( M_ptree_set[(M*3)+2]), supM_2=supM& ( M_ptree_set[(M*3)+0])& (~M_ptree_set[(M*3)+1])& (~M_ptree_set[(M*3)+2]), 
supM_3=supM& ( M_ptree_set[(M*3)+0])& (~M_ptree_set[(M*3)+1])& ( M_ptree_set[(M*3)+2]), supM_4=supM& ( M_ptree_set[(M*3)+0])& ( M_ptree_set[(M*3)+1])& (~M_ptree_set[(M*3)+2]), supM_5=supM& ( M_ptree_set[(M*3)+0])& ( M_ptree_set[(M*3)+1])& ( M_ptree_set[(M*3)+2]), sou, souM, souU, som, somU, somM, spM, spU; auto double thr1,expnt1,thr2,expnt2,s,S,ss,sn,sM,sU,c,C,wt,XBalVT,wt_const=16;

  12. movie-vote.C ARM code 3 /* Association Rule Mining (ARM) to enhance Movie Votes: *For each rating value, k=1,2,3,4,5, we consider in turn only movie voters that are rated k by U. *We loop through those movies, N, for which rating(N,U)=k, one k=1,2,3,4,5 at at time; *looking for strong Rules, N-->M (the N_arm_k LOOP, k=1,2,3,4,5), *then for strong Rules, N,O-->M (the NO_arm_k LOOP, k=1,2,3,4,5), *then for strong Rules, N,O,P-->M (the NOP_arm_k LOOP, k=1,2,3,4,5), *then for strong Rules, N,O,P,Q-->M (the NOPQ_arm_k LOOP, k=1,2,3,4,5), *then for strong Rules, N,O,P,Q,R-->M (the NOPQR_arm_k LOOP, k=1,2,3,4,5), *then for strong Rules, N,O,P,Q,R,S-->M (the NOPQRS_arm_k LOOP, k=1,2,3,4,5), *then for strong Rules, N,O,P,Q,R,S,T-->M (the NOPQRST_arm_k LOOP, k=1,2,3,4,5), * When a strong rule is found, we issue "bak" ballots to the antecedent set to vote for k. * We allow "shifting" votes from "k" with "vsk" (vote shift). Thus the parameters are: *spk (min Rating=1 ARM support threshold), *cfk (minimum Rating=1 ARM confidence threshold), *bak (num of ballot issued to Ratings=1_Strong_ARM antecedent set), vsk (shift for vote=1). * For each rating, k=12345 doing a separate "rating=k Assoc. Rule Mine (for strong rules * with consequent={M}, antecedent=a_movie set. For each strong rule extra votes are given and * vote shift allowed to optimize benefit. Each rule support has its own loop. If individual * ARM_supports are not allowed, downward closure of ARM support can be applied. */ //SAMPLE-statistic-based dMNsds pruning config parameters hijacked here for ARM parm use. internal_prune = pcfg->get_internal_prune(movie_dMNsds); thr1=internal_prune->threshold; expnt1=internal_prune->exponent; internal_prune = pcfg->get_internal_prune(movie_Nsds_Msds); thr2=internal_prune->threshold; expnt2=internal_prune->exponent;

  13. movie-vote.C ARM code 4 #if 1 // ARM(pre 5/1/10) #if 1 // Movie_ARM Rating = 1 #if 1 // N Movie_ARM Rating = 1 auto unsigned long long int *supUlist_1=supU_1.get_indexes(); for (unsigned long long int n = 0; n < supU_1.get_count(); ++n) { auto unsigned long long int N=supUlist_1[n]; auto PTree supN = Movies.get_users(N), supN_1 = supN & (~M_ptree_set[(N*3)+0]) & ( M_ptree_set[(N*3)+1]) & ( M_ptree_set[(N*3)+2]), supN_2 = supN & ( M_ptree_set[(N*3)+0]) & (~M_ptree_set[(N*3)+1]) & (~M_ptree_set[(N*3)+2]), supN_3 = supN & ( M_ptree_set[(N*3)+0]) & (~M_ptree_set[(N*3)+1]) & ( M_ptree_set[(N*3)+2]), supN_4 = supN & ( M_ptree_set[(N*3)+0]) & ( M_ptree_set[(N*3)+1]) & (~M_ptree_set[(N*3)+2]), supN_5 = supN & ( M_ptree_set[(N*3)+0]) & ( M_ptree_set[(N*3)+1]) & ( M_ptree_set[(N*3)+2]), csMN_1 = supM_1 & supN_1, csMN_2 = supM_2 & supN_2, csMN_3 = supM_3 & supN_3, csMN_4 = supM_4 & supN_4, csMN_5 = supM_5 & supN_5; #if 1 // vote code auto double NU = Users.get_rating(U,N)-2, sMNN1= csMN_1.get_count(), sNN1= supN_1.get_count() , sMNN2= csMN_2.get_count(), sNN2= supN_2.get_count() , sMNN3= csMN_3.get_count(), sNN3= supN_3.get_count() , sMNN4= csMN_4.get_count(), sNN4= supN_4.get_count() , sMNN5= csMN_5.get_count(), sNN5= supN_5.get_count() , sMNNn1= sMNN2+sMNN3+sMNN4+sMNN5, sNNn1= sNN2+sNN3+sNN4+sNN5; if ( ( (sMNNn1 > sNNn1 * expnt1 ) ) && ( ( sMNN1 > sNN1 * thr1 ) ) ) { VOTE_sum += UCor * NU ; VOTE_cnt += UCor ; } #endif // vote code #endif // N Rating = 1 }

  14. movie-vote.C ARM code 5 #if 1 // Nearest Neighbor Code supU.clearbit(M); auto unsigned long long int *supUlist = supU.get_indexes(); for (unsigned long long int n = 0; n < supU.get_count(); ++n)//NLOOP (Ns are movie voters) {auto unsigned long long int N=supUlist[n]; if (N == M) continue; auto double NU=Users.get_rating(U,N)-2,MAX=0,smN=0,smM=0,MM=0,MN=0,NN=0,denom=0,dm; auto PTree supN=Movies.get_users(N), csMN= supM & supN; csMN.clearbit(U); dm=csMN.get_count(); if(dm<1) continue; /* External pruning: PRUNE USERS CoSupMN */ external_prune = pcfg->get_movie_Prune_Users_in_CoSupMN(); if (external_prune->enabled) {if(csMN.get_count()>external_prune->params.Ct) do_pruning(external_prune,M,U,csMN,supU); csMN.clearbit(U); supU.clearbit(M); dm = csMN.get_count(); if( dm < 1) continue;} /* Adjusted Cosine declarations */ auto double ACCor,Vbar,ACCnum=0, ACCden, ACCdenSum1=0, ACCdenSum2=0; /* NV: VLOOP (Vs are user dimensions) */ auto unsigned long long int *csMNlist = csMN.get_indexes(); for (unsigned long long int v= 0; v < csMN.get_count(); ++v) { auto unsigned long long int V=csMNlist[v]; auto double MV=Users.get_rating(V,M)-2, NV=Users.get_rating(V,N)-2; if(pow(MV-NV,2) > MAX) MAX=pow(MV-NV,2); smN+=NV; smM+=MV; MM+=MV*MV; MN+=NV*MV; NN+=NV*NV; ++denom; /* Adjusted Cosine code */ auto PTree supV=Users.get_movies(V); Vbar=Users.get_mean(V,supV); ACCnum+=(NV-Vbar)*(MV-Vbar); ACCdenSum1+=(NV-Vbar)*(NV-Vbar); ACCdenSum2+=(MV-Vbar)*(MV-Vbar); }//VLOOP ends

  15. movie-vote.C ARM code 6 /* Adjusted Cosine code */ ACCden=pow(ACCdenSum1,.5)*pow(ACCdenSum2,.5); ACCor=ACCnum/ACCden;UCor=ACCor;dm=csMN.get_count(); if(denom<1) continue; else {Nb=smN/dm; Mb=smM/dm; dsSq=NN-2*MN MM; VOTE=NU-Nb+Mb;} /* force_vote_in_Voter_Loop goes here. */ if ( pcfg->movie_vote_force_in_loop() ) { if ((VOTE<1) && (VOTE!=DEFAULT_VOTE)) VOTE=1; if ((VOTE>5) && (VOTE!=DEFAULT_VOTE)) VOTE=5; } /* SAMPLE-statistic-based pruning thru early exit */ if( dm > 1 ) { internal_prune = pcfg->get_internal_prune(movie_dMNsds); if ( internal_prune->enabled ) { auto double dMNsds, thr = internal_prune->threshold, expnt = internal_prune->exponent; dMNsds = pow((dsSq-dm*(Nb-Mb)*(Nb-Mb))/(dm-1), 0.5); //if(dMNsds>thr)continue; hijacking dMNsds expnt as UCor exponent if(UCor>=0)UCor=pow(UCor,expnt); } internal_prune = pcfg->get_internal_prune(movie_Nsds_Msds); if(internal_prune->enabled) { auto double Msds, Nsds, thr=internal_prune->threshold; Msds = pow((MM-dm*Mb*Mb)/(dm-1), 0.5); Nsds = pow((NN-dm*Nb*Nb)/(dm-1), 0.5); if ( Nsds > (thr * Msds) ) continue; } internal_prune = pcfg->get_internal_prune(movie_DVCors); if ( internal_prune->enabled ) { auto double Msds,Nsds,DVCors, thr=internal_prune->threshold, expnt=internal_prune->exponent; Msds=pow(dm*MM-smM*smM,.5)/dm; Nsds=pow(dm*NN-smN*smN,.5)/dm; DVCors=exp(expnt*(Nsds-Msds)*(Nsds-Msds)); if(DVCors<thr)continue; if(internal_prune->weight)UCor=DVCors; } internal_prune = pcfg->get_internal_prune(movie_VDCors); if ( internal_prune->enabled ) { auto double VDCors,dMNsds,thr=internal_prune->threshold,expnt=internal_prune->exponent; dMNsds=pow((dsSq-dm*(Nb-Mb)*(Nb-Mb))/(dm-1), .5); VDCors=exp(expnt*dMNsds*dMNsds); if ( VDCors < thr ) continue; if ( internal_prune->weight ) UCor = VDCors; } } // end of SAMPLE-statistic-based pruning through early

  16. /*POPULATION-statistics pruning thru early exit*/ if(dm>0){ internal_prune=pcfg->get_internal_prune(movie_dMNsdp); if(internal_prune->enabled){ auto double dMNsdp,thr=internal_prune->threshold, expnt=internal_prune->exponent; dMNsdp=pow(dm*dsSq-(smN-smM)*(smN-smM),.5)/dm; dMNsdp=pow(-expnt*dMNsdp,2); ifdMNsdp>thr) continue;} internal_prune = pcfg->get_internal_prune(movie_Nsdp_Msdp); if (internal_prune->enabled){ auto double Nsdp,Msdp,thr=internal_prune->threshold; Msdp=pow(dm*MM-smM*smM,.5)/dm; Nsdp=pow(dm*NN-smN*smN,.5)/dm; if( Nsdp > (thr * Msdp))continue; } internal_prune = pcfg->get_internal_prune(movie_DVCorp); if(internal_prune->enabled){ auto double DVCorp,Msdp,Nsdp,thr=internal_prune->threshold,expnt=internal_prune->exponent; Msdp=pow(dm*MM-smM*smM,.5)/dm;Nsdp=pow(dm*NN-smN*smN,.5)/dm; DVCorp=exp(expnt*(Nsdp-Msdp)*(Nsdp-Msdp)); if ( DVCorp<thr) continue; if ( internal_prune->weight ) UCor = DVCorp; } internal_prune = pcfg->get_internal_prune(movie_VDCorp); if (internal_prune->enabled){ auto double VDCorp,dMNsdp,thr=internal_prune->threshold,expnt=internal_prune->exponent; dMNsdp=pow(dm*dsSq-(smN-smM)*(smN-smM),.5)/dm; VDCorp =exp(expnt*dMNsdp*dMNsdp); if ( VDCorp < thr ) continue; if ( internal_prune->weight ) UCor = VDCorp; } internal_prune = pcfg->get_internal_prune(movie_SCor); if (internal_prune->enabled){ auto double SCor, thr=internal_prune->threshold; SCor=(MN-dm*Mb*Nb)/(.0001+(pow((MM-dm*pow(Mb,2)),.5))*(.0001+pow((NN-dm*pow(Nb,2)),.5))); if ( SCor < thr ) continue; if ( internal_prune->weight ) UCor = SCor; } internal_prune = pcfg->get_internal_prune(movie_PCor); if (internal_prune->enabled){ auto double ONEPDS,PCor=1,thr=internal_prune->threshold;ONEPDS=dsSq-dm*pow(Nb-Mb,2); if ( MAX > 0 ) PCor = exp(-0.1 * ONEPDS / (pow(MAX, .75) * pow(dm,.5))); if( PCor < thr ) continue; if ( internal_prune->weight ) UCor = PCor; } internal_prune = pcfg->get_internal_prune(movie_DCor); if(internal_prune->enabled){ auto double 
DCor,ONEPDS,thr=internal_prune->threshold; ONEPDS=dsSq-dm*pow(Nb-Mb,2); DCor=exp(-dsSq/100); if ( DCor < thr ) continue; if ( internal_prune->weight ) UCor = DCor; } } // POPULATION-statistics-based pruning through early exit ends here. if ( UCor > 0 ) { VOTE_sum+=VOTE*UCor; VOTE_cnt+=UCor; } else continue; /* force_vote_in_Voter_Loop goes here. */ if ( pcfg->movie_vote_force_in_loop() ) { if ( (VOTE<1) && (VOTE!= DEFAULT_VOTE) ) VOTE=1; if ((VOTE>5) && (VOTE!=DEFAULT_VOTE)) VOTE=5; } } /* ends NV NLOOP (movie voter loop) */ #endif //Nearest Neighbor Code if ( VOTE_cnt > 0 ) VOTE=VOTE_sum/VOTE_cnt; else VOTE = DEFAULT_VOTE; /* force_vote_after_Voter_Loop goes here. */ if ( pcfg->movie_vote_force_after_loop() ) { if ( (VOTE < 1) && (VOTE != DEFAULT_VOTE) ) VOTE=1; if ( (VOTE > 5) && (VOTE != DEFAULT_VOTE) ) VOTE=5; return VOTE;} movie-vote.C ARM code 7

  17. movie-vote.C ARM code 8 if (UCor>0) {VOTE_sum+=VOTE*UCor; VOTE_cnt+=UCor; } else continue; /* force_vote_in_Voter_Loop goes here. */ if ( pcfg->movie_vote_force_in_loop() ) { if ( (VOTE<1) && (VOTE!= DEFAULT_VOTE) ) VOTE=1; if ( (VOTE>5) && (VOTE!= DEFAULT_VOTE) ) VOTE=5; } } /* ends NV NLOOP (movie voter loop) */ #endif //Nearest Neighbor Code if ( VOTE_cnt>0 ) VOTE=VOTE_sum/VOTE_cnt; else VOTE=DEFAULT_VOTE; /* force_vote_after_Voter_Loop goes here. */ if ( pcfg->movie_vote_force_after_loop() ) { if ( (VOTE < 1) && (VOTE != DEFAULT_VOTE) ) VOTE=1; if ( (VOTE > 5) && (VOTE != DEFAULT_VOTE) ) VOTE=5; return VOTE; }

  18. createconfigs script in src/mpp-mpred-3.2.0/p95/mu11 #!/bin/bash for g in .1 .2 .4 .7 .9 do sed -i -e "s/dMNsdsThr=[^ ]*/dMNsdsThr=$g/" t.config for h in .1 .2 .4 .7 .9 do sed -i -e "s/dMNsdsExp=[^ ]*/dMNsdsExp=$h/" t.config cp t.config configs/a$g$h.config done done submitin src/mpp-mpred-3.2.0 produces here #!/bin/bash for g in .1 .2 .4 .7 .9 do for h in .1 .2 .4 .7 .9 do ./mpp-submit -S -i Data/p95test.txt -c p95/mu11/configs a$g$h.out -t .05 -d ./p95/mu11 done done -rw-r--r-- 1 perrizo faculty 3626 Nov 3 10:15 a.1.1.out -rw-r--r-- 1 perrizo faculty 3626 Nov 3 10:15 a.1.2.out -rw-r--r-- 1 perrizo faculty 3626 Nov 3 10:15 a.1.4.out -rw-r--r-- 1 perrizo faculty 3626 Nov 3 10:15 a.1.7.out -rw-r--r-- 1 perrizo faculty 3625 Nov 3 10:15 a.1.9.out -rw-r--r-- 1 perrizo faculty 3626 Nov 3 10:15 a.2.1.out -rw-r--r-- 1 perrizo faculty 3626 Nov 3 10:15 a.2.2.out -rw-r--r-- 1 perrizo faculty 3626 Nov 3 10:16 a.2.4.out -rw-r--r-- 1 perrizo faculty 3626 Nov 3 10:16 a.2.7.out -rw-r--r-- 1 perrizo faculty 3626 Nov 3 10:16 a.2.9.out -rw-r--r-- 1 perrizo faculty 3626 Nov 3 10:16 a.4.1.out -rw-r--r-- 1 perrizo faculty 3625 Nov 3 10:16 a.4.2.out -rw-r--r-- 1 perrizo faculty 3626 Nov 3 10:16 a.4.4.out -rw-r--r-- 1 perrizo faculty 3626 Nov 3 10:16 a.4.7.out -rw-r--r-- 1 perrizo faculty 3626 Nov 3 10:17 a.4.9.out -rw-r--r-- 1 perrizo faculty 3626 Nov 3 10:17 a.7.1.out -rw-r--r-- 1 perrizo faculty 3625 Nov 3 10:17 a.7.2.out -rw-r--r-- 1 perrizo faculty 3626 Nov 3 10:17 a.7.4.out -rw-r--r-- 1 perrizo faculty 3626 Nov 3 10:17 a.7.7.out -rw-r--r-- 1 perrizo faculty 3626 Nov 3 10:17 a.7.9.out -rw-r--r-- 1 perrizo faculty 3626 Nov 3 10:17 a.9.1.out -rw-r--r-- 1 perrizo faculty 3626 Nov 3 10:18 a.9.2.out -rw-r--r-- 1 perrizo faculty 3626 Nov 3 10:18 a.9.4.out -rw-r--r-- 1 perrizo faculty 3625 Nov 3 10:18 a.9.7.out -rw-r--r-- 1 perrizo faculty 3626 Nov 3 10:18 a.9.9.out which I then copy to src/mpp-mpred-3.2.0/dotouts. 
creates in src.mpp-mpred-3.2.0/p95/mu11/configs: -rw-r--r-- 1 perrizo faculty 7441 Nov 3 10:11 a.1.1.config -rw-r--r-- 1 perrizo faculty 7441 Nov 3 10:11 a.1.2.config -rw-r--r-- 1 perrizo faculty 7441 Nov 3 10:11 a.1.4.config -rw-r--r-- 1 perrizo faculty 7441 Nov 3 10:11 a.1.7.config -rw-r--r-- 1 perrizo faculty 7441 Nov 3 10:11 a.1.9.config -rw-r--r-- 1 perrizo faculty 7441 Nov 3 10:11 a.2.1.config -rw-r--r-- 1 perrizo faculty 7441 Nov 3 10:11 a.2.2.config -rw-r--r-- 1 perrizo faculty 7441 Nov 3 10:11 a.2.4.config -rw-r--r-- 1 perrizo faculty 7441 Nov 3 10:11 a.2.7.config -rw-r--r-- 1 perrizo faculty 7441 Nov 3 10:11 a.2.9.config -rw-r--r-- 1 perrizo faculty 7441 Nov 3 10:12 a.4.1.config -rw-r--r-- 1 perrizo faculty 7441 Nov 3 10:12 a.4.2.config -rw-r--r-- 1 perrizo faculty 7441 Nov 3 10:12 a.4.4.config -rw-r--r-- 1 perrizo faculty 7441 Nov 3 10:12 a.4.7.config -rw-r--r-- 1 perrizo faculty 7441 Nov 3 10:12 a.4.9.config -rw-r--r-- 1 perrizo faculty 7441 Nov 3 10:12 a.7.1.config -rw-r--r-- 1 perrizo faculty 7441 Nov 3 10:12 a.7.2.config -rw-r--r-- 1 perrizo faculty 7441 Nov 3 10:12 a.7.4.config -rw-r--r-- 1 perrizo faculty 7441 Nov 3 10:12 a.7.7.config -rw-r--r-- 1 perrizo faculty 7441 Nov 3 10:12 a.7.9.config -rw-r--r-- 1 perrizo faculty 7441 Nov 3 10:12 a.9.1.config -rw-r--r-- 1 perrizo faculty 7441 Nov 3 10:12 a.9.2.config -rw-r--r-- 1 perrizo faculty 7441 Nov 3 10:12 a.9.4.config -rw-r--r-- 1 perrizo faculty 7441 Nov 3 10:12 a.9.7.config -rw-r--r-- 1 perrizo faculty 7441 Nov 3 10:12 a.9.9.config

  19. submit script run in src/mpp-mpred-3.2.0 also produces these subdirs in mpp-mpred-3.2.0 : p95test.txt.rmse Movie: 12641: 0: Answer: 1 Prediction: 1.22 Error: 0.04840 1: Answer: 4 Prediction: 3.65 Error: 0.12250 2: Answer: 2 Prediction: 2.55 Error: 0.30250 3: Answer: 4 Prediction: 4.04 Error: 0.00160 4: Answer: 2 Prediction: 1.85 Error: 0.02250 Sum: 0.49750 Total: 5 RMSE: 0.315436 Running RMSE: 0.315436 / 5 predictions Movie: 12502: 0: Answer: 4 Prediction: 4.71 Error: 0.50410 1: Answer: 5 Prediction: 3.54 Error: 2.13160 2: Answer: 5 Prediction: 3.87 Error: 1.27690 3: Answer: 3 Prediction: 3.33 Error: 0.10890 4: Answer: 2 Prediction: 2.97 Error: 0.94090 Sum: 4.96240 Total: 5 RMSE: 0.996233 Running RMSE: 0.738911 / 10 predictions . . . Movie: 10811: 0: Answer: 5 Prediction: 4.05 Error: 0.90250 1: Answer: 3 Prediction: 3.49 Error: 0.24010 2: Answer: 4 Prediction: 3.94 Error: 0.00360 3: Answer: 3 Prediction: 3.39 Error: 0.15210 Sum: 1.29830 Total: 4 RMSE: 0.569715 Running RMSE: 0.964397 / 743 predictions Movie: 12069: 0: Answer: 4 Prediction: 3.20 Error: 0.64000 1: Answer: 3 Prediction: 3.48 Error: 0.23040 Sum: 0.87040 Total: 2 RMSE: 0.659697 Prediction summary: Sum: 691.90610 Total: 745 RMSE: 0.963708 .predictions 12641: 1.22 3.65 2.55 4.04 1.85 12502: 4.71 3.54 3.87 3.33 2.97 . . . 
10811: 4.05 3.49 3.94 3.39 12069: 3.20 3.48 drwxr-xr-x 2 perrizo faculty 4096 Nov 3 10:15 a.1.1 drwxr-xr-x 2 perrizo faculty 4096 Nov 3 10:15 a.1.2 drwxr-xr-x 2 perrizo faculty 4096 Nov 3 10:15 a.1.4 drwxr-xr-x 2 perrizo faculty 4096 Nov 3 10:15 a.1.7 drwxr-xr-x 2 perrizo faculty 4096 Nov 3 10:15 a.1.9 drwxr-xr-x 2 perrizo faculty 4096 Nov 3 10:15 a.2.1 drwxr-xr-x 2 perrizo faculty 4096 Nov 3 10:15 a.2.2 drwxr-xr-x 2 perrizo faculty 4096 Nov 3 10:16 a.2.4 drwxr-xr-x 2 perrizo faculty 4096 Nov 3 10:16 a.2.7 drwxr-xr-x 2 perrizo faculty 4096 Nov 3 10:16 a.2.9 drwxr-xr-x 2 perrizo faculty 4096 Nov 3 10:16 a.4.1 drwxr-xr-x 2 perrizo faculty 4096 Nov 3 10:16 a.4.2 drwxr-xr-x 2 perrizo faculty 4096 Nov 3 10:16 a.4.4 drwxr-xr-x 2 perrizo faculty 4096 Nov 3 10:16 a.4.7 drwxr-xr-x 2 perrizo faculty 4096 Nov 3 10:17 a.4.9 drwxr-xr-x 2 perrizo faculty 4096 Nov 3 10:17 a.7.1 drwxr-xr-x 2 perrizo faculty 4096 Nov 3 10:17 a.7.2 drwxr-xr-x 2 perrizo faculty 4096 Nov 3 10:17 a.7.4 drwxr-xr-x 2 perrizo faculty 4096 Nov 3 10:17 a.7.7 drwxr-xr-x 2 perrizo faculty 4096 Nov 3 10:17 a.7.9 drwxr-xr-x 2 perrizo faculty 4096 Nov 3 10:17 a.9.1 drwxr-xr-x 2 perrizo faculty 4096 Nov 3 10:18 a.9.2 drwxr-xr-x 2 perrizo faculty 4096 Nov 3 10:18 a.9.4 drwxr-xr-x 2 perrizo faculty 4096 Nov 3 10:18 a.9.7 drwxr-xr-x 2 perrizo faculty 4096 Nov 3 10:18 a.9.9 and e.g., a.9.9 contains: -rw-r--r-- 1 perrizo faculty 7441 Nov 3 10:17 a.9.9.config -rw-r--r-- 1 perrizo faculty 5191 Nov 3 10:18 hi-a.9.9.txt -rw-r--r-- 1 perrizo faculty 1808 Nov 3 10:18 hi-a.9.9.txt.answers -rw-r--r-- 1 perrizo faculty 1465 Nov 3 10:18 lo-a.9.9.txt -rw-r--r-- 1 perrizo faculty 688 Nov 3 10:18 lo-a.9.9.txt.answers -rw-r--r-- 1 perrizo faculty 4330 Nov 3 10:18 p95test.txt.predictions -rw-r--r-- 1 perrizo faculty 46147Nov 3 10:18 p95test.txt.rmse

  20. In dotouts is a script, createtablejob: #!/bin/bash for g in .1 .2 .4 .7 .9 do for h in .1 .2 .4 .7 .9 do grep Input:\ \ \ lo a$g$h.out >> job done done In dotouts is a script, createtablermse: #!/bin/bash for g in .1 .2 .4 .7 .9 do for h in .1 .2 .4 .7 .9 do grep RMSE:\ a$g$h.out >> rmse done done Sum: 692.82510 Total: 745 RMSE: 0.964348 Sum: 691.59330 Total: 745 RMSE: 0.963490 Sum: 691.90610 Total: 745 RMSE: 0.963708 Sum: 691.90610 Total: 745 RMSE: 0.963708 Sum: 691.90610 Total: 745 RMSE: 0.963708 Sum: 691.84690 Total: 745 RMSE: 0.963667 Sum: 690.47330 Total: 745 RMSE: 0.962710 Sum: 691.90610 Total: 745 RMSE: 0.963708 Sum: 691.90610 Total: 745 RMSE: 0.963708 Sum: 691.90610 Total: 745 RMSE: 0.963708 Sum: 693.27970 Total: 745 RMSE: 0.964664 Sum: 691.90610 Total: 745 RMSE: 0.963708 Sum: 691.90610 Total: 745 RMSE: 0.963708 Sum: 691.90610 Total: 745 RMSE: 0.963708 Sum: 691.90610 Total: 745 RMSE: 0.963708 Sum: 691.90610 Total: 745 RMSE: 0.963708 Sum: 691.90610 Total: 745 RMSE: 0.963708 Sum: 691.90610 Total: 745 RMSE: 0.963708 Sum: 691.90610 Total: 745 RMSE: 0.963708 Sum: 691.90610 Total: 745 RMSE: 0.963708 Sum: 691.90610 Total: 745 RMSE: 0.963708 Sum: 691.90610 Total: 745 RMSE: 0.963708 Sum: 691.90610 Total: 745 RMSE: 0.963708 Sum: 691.90610 Total: 745 RMSE: 0.963708 Sum: 691.90610 Total: 745 RMSE: 0.963708 Input: lo-a.1.1.txt Input: lo-a.1.2.txt Input: lo-a.1.4.txt Input: lo-a.1.7.txt Input: lo-a.1.9.txt Input: lo-a.2.1.txt Input: lo-a.2.2.txt Input: lo-a.2.4.txt Input: lo-a.2.7.txt Input: lo-a.2.9.txt Input: lo-a.4.1.txt Input: lo-a.4.2.txt Input: lo-a.4.4.txt Input: lo-a.4.7.txt Input: lo-a.4.9.txt Input: lo-a.7.1.txt Input: lo-a.7.2.txt Input: lo-a.7.4.txt Input: lo-a.7.7.txt Input: lo-a.7.9.txt Input: lo-a.9.1.txt Input: lo-a.9.2.txt Input: lo-a.9.4.txt Input: lo-a.9.7.txt Input: lo-a.9.9.txt

  21. 10_16_10 The Netflix Program: ( Mi, ProbeSup(Mi)={Ui1, …, Uik})  mpp-mpred.C Loops thru ProbeSup, from uservote, movieVOTE writes Predict(Mi,Uik) to predictions  UikProbeSup(Mi) ( Mi , Sup(Mi), Uik , Sup(Uik ))  ( Mi , Sup(Mi), Uik , Sup(Uik ))   vote(Mi ,Uik )  VOTE(Mi ,Uik ) mpp-user.C movie-vote.C user-vote.C prune.C Netflix Classification use the RentsTrainingTable, Rents(MID,UID,Rating,Date) and class label Rating, to classify new (MID,UID,Date) tuples (i.e., predict ratings). Nearest Neighbor User Voting: uid votes on rating(MID,UID) if it is near enough to UID in it’s ratings of movies M={mid1, ..., midk} (i.e., near is based on a User-User correlation over M ). User-User-Correlation? (Pearson, Cosine?) and the set M={mid1,…, midk }. Nearest Neighbor Movie Voting: mid votes on rating(MID,UID) if its ratings by U={uid1,..., uidk} are near enough to those of MID (i.e., near is based on a Movie-Movie correlation over U). Movie-Movie-Correlation? (Pearson or Cosine or?) and set U={uid1,…, uidk }. mpp-mpred.Creads PROBE, loops thru (Mi, ProbeSup(Mi), pass each to mpp-user.C. mpp-mpred.C can call separate instances of mpp-user.C for many Us (in parallel (governed by # of slots.) mpp-user.Cloops thru ProbeSup(M), reads config file, prints prediciton(M,U) to predictions For user votes, mpp-user.C calls user-vote.C For movie votes, mpp-user.C calls movie-vote.C user-vote.C prunes, loops thru user voters, V. calculating a V-vote. Combines V-votes and returns vote. movie-vote.Csimilar. We must loop thru V’s (VPHD rather than HPVD) because the HP required of most correlation calculations is impossible using AND/OR/COMP. Today we will take a close look at the data mining algorithms in movie-vote.C (first the Nearest Neighbor Classification code, then ARM code, then??? Similar (dual) code either exists or will exist in user-vote.C. The file, movie-vote-full.C, contains ARM attempts, Boundary-based attempts and the Nearest Neighbor Classification attempts. 
The file movie-vote-justNN.C contains only the NN attempts (so we will start with that). A long-term goal: generalize the code away from the Netflix problem and toward a generic data mining system (e.g., for use by the Treeminer Corp. on, say, satellite imagery).

  22. How does one specify prunings? mpp-mpred.C specifies type of prune ( 3 types: UserPrune with a full range of possibilities; UserFastPrune with just PearsonCorrelation pruning; CommonCoSupportPrune which orders users, V, according to the size of their CommonCoSupport with U only (note that this is a correlation of sorts too.) mpp-user.C movie-vote.C user-vote.C threshold "diff of vectors" population-based std_dev prune specify leftside (from Uid) of an ID interval prune of supM specify the width of an ID interval prune of supM specify starting movie (intercept and slope) for N loop specify starting movie (intercept and slope) for V loop threshold for count based prune specify PearsonCorr threshold (b=bill, meaning: use bill's formula - note if prior pruning this will have a different value than Amal's) specify PearsonCorr threshold (a=Amal, meaning: use Amal's table lookup) threshold "vectorof diffs" population-based std_dev prune threshold "vector of diffs"sample-based std_dev prune threshold (Gaussian of) Euclidean distance based prune threshold for (Gaussian of) 1perpendicular distance prune exponent for (Gaussian of) 1perpendicular distance prune threshold (Gaussian of) a variation based prune threshold std_dev based prune Picks odering for count-based prune below: 1=Amal_Pearson, 2=Bill_Pearson, etc. threshold "diff of vectors"sample-based std_dev prune prune.C In a file (named config) there's a section for specifying the parameters for user-voting and a separate section for specifying parameters for movie-voting. E.g., for movie voting, at the bottom, there are 3 external prunings possible (0 or more can be chosen): 1. an intial pruning of dimensions to be used (since dimensions are user, it prunes supM): 2. a pruning of movie voters, N, (in supU) 3 a final pruning of dimensions (CoSupport(M,N) for the specific movie voter, N. E.g., parameters are specified for this final prune as below. 
Finally, note that internal to user-vote and movie-vote are "internal prunings", in which voters are rejected (during their loop pass) if they fail to meet certain correlation levels. This type of internal pruning is somewhat redundant with the external prunings below. [movie_voting Prune_Users_in_CoSupMN] method = UserCommonCoSupportPrune leftside = 0 width = 8000 mstrt = 0 mstrt_mult = 0.0 ustrt = 0 ustrt_mult = 0.0 TSa = -100 TSb = -100 Tdvp = -1 Tdvs = -1 Tvdp = -1 Tvds = -1 TD = -1 TP = -1 PPm = .1 TV = -1 TSD = -1 Ch = 1 Ct = 2 Note: all thresholds are for similarities, not distances; i.e., when we start with a distance, we follow it with the Gaussian to make it a similarity or correlation.

  23. APPENDIX:Heuristic algorithm to PruneOff v's that can't give low 1pC2u,v values for uQSm U\M 1 2 5 6 8 c 6 5 5 d 3 6 6 7 e 3 f 7 7 7 cQ Q d eQ Q Q fQ Q Since purple is rare, skip it, (not much beneficial pruning): For uQSm, pruneOff v : MidBitsu,vInTop2mTSu differ for >0 movies From the remaining, choose the TopKv 3 0 1 1 4 1 0 0 5 1 0 1 6 1 1 0 7 1 1 1 If any HiBitsv,u differ in TopLmCoSupu,v there is a 3, which is rare, so PruneOff! If all HiBitsv,u are the same PruneOff all v such that MidBitsu,v differ for >H movies From the remaining, choose the TopKv for that uQSm T(U, M,r) c 1 6 110 c 2 5 101 c 5 5 101 d 1 3 011 d 2 6 110 d 5 6 110 d 6 7 111 d 8 6 110 e 1 3 011 f 1 7 111 f 5 7 111 f 8 7 111 m=2UU2(QS2TS2={e,f}{c,d}, n1pC2) 0=Cor2,1 1=Cor2,5 0=Cor2,6 Cor2,8=und Top22={1,5} (e,2)Q TSe={1}=Top22TSe so PruneOff v : MidBitse,vIn{1} differ: none! leaves {c,d}TS2 (f,2)Q TSf={158} {15}=Top22TSf so PruneOff v : MidBitsf,vIn{15} differ: {c,e} leaves {d}TS2 m=5UU5(QS5TS5={e}{c,d,f}, n1pC2) Corr5,1=.33 Corr5,2=1 Corr5,6=0, Corr5,8=1 Top25={2,8} (e,5)Q TSe={1} =Top25TSe so PruneOff none! leaves {c,d,f}TS5 c2 1 1 1 0 0 d2 0 1 1 1 1 e2 0 0 0 0 0 f2 1 0 1 0 1 12 1 0 0 1 221 1 0 0 m=6UU6(QS6TS6={c,f}{d}, n1pC2) Corr6,1=0 Corr6,2=0 Corr6,5=0 Corr6,8=und Top26={2,5} 521 1 0 1 (c,6)Q TSc={125} {25}=Top26TSc so PruneOff v : MidBitsc,vIn{25} differ: {d,f}! leaves {}TS6 62 0 1 0 0 82 0 1 0 1 (f,6)Q TSf={128} {2}=Top26TSf so PruneOff v : MidBitsf,vIn{2} differ: {d}! leaves {}TS6 c1 1 0 0 0 0 d1 1 1 1 1 1 e1 1 0 0 0 0 f1 1 0 1 0 1 m=8UU8(QS8TS8={c,e}{d,f}, n1pC2) Corr8,1=1 Corr8,2=und Corr8,5=1 Corr8,6=und Top28={1,5} 111111 (c,8)Q TSc={125} {15}=Top28TSc so PruneOff v : MidBitsc,vIn{15} differ: {d,f}! leaves {}TS8 2101 0 0 5101 0 1 (e,8)Q TSe={1}=Top28TSe so PruneOff v : MidBitse,vIn{1} differ: none! 
leaves {d,f}TS8 61 0 1 0 0 81 0 1 0 1 c0 0 1 1 0 0 d0 1 0 0 1 0 e0 1 0 0 0 0 f0 1 0 1 0 1 So substantial pruning except for e 10 0 1 1 1 20 1 0 0 0 50 1 0 0 1 60 0 1 0 0 80 0 0 0 1

  24. Heuristic algorithm to PruneOff v's that can't give low 1pC2u,v values for uQSm U\M 1 2 5 6 8 c 6 5 5 d 3 6 6 7 e 3 f 7 7 7 cQ Q d eQ Q Q fQ Q For uQSm, pruneOff v : MidBitsu,vInTop2mTSu differ for >1 movies From the remaining, choose the TopKv 3 0 1 1 4 1 0 0 5 1 0 1 6 1 1 0 7 1 1 1 T(U, M,r) c 1 6 110 c 2 5 101 c 5 5 101 d 1 3 011 d 2 6 110 d 5 6 110 d 6 7 111 d 8 6 110 e 1 3 011 f 1 7 111 f 5 7 111 f 8 7 111 m=2UU2(QS2TS2={e,f}{c,d}, n1pC2) 0=Cor2,1 1=Cor2,5 0=Cor2,6 Cor2,8=und Top22={1,5} (e,2)Q TSe={1}=Top22TSe so PruneOff v : MidBitse,vIn{1} differ for >1: none! {c,d}TS2 (f,2)Q TSf={158} {15}=Top22TSf so PruneOff v : MidBitsf,vIn{15} differ for >1: none {c,d}TS2 m=5UU5(QS5TS5={e}{c,d,f}, n1pC2) Corr5,1=.33 Corr5,2=1 Corr5,6=0, Corr5,8=1 Top25={2,8} (e,5)Q TSe={1} =Top25TSe so PruneOff none! leaves {c,d,f}TS5 c2 1 1 1 0 0 d2 0 1 1 1 1 e2 0 0 0 0 0 f2 1 0 1 0 1 12 1 0 0 1 221 1 0 0 m=6UU6(QS6TS6={c,f}{d}, n1pC2) Corr6,1=0 Corr6,2=0 Corr6,5=0 Corr6,8=und Top26={2,5} 521 1 0 1 (c,6)Q TSc={125} {25}=Top26TSc so PruneOff v : MidBitsc,vIn{25} differ >1: {d}! leaves {}TS6 62 0 1 0 0 82 0 1 0 1 (f,6)Q TSf={128} {2}=Top26TSf so PruneOff v : MidBitsf,vIn{2} differ>1: {}! leaves {d}TS6 c1 1 0 0 0 0 d1 1 1 1 1 1 e1 1 0 0 0 0 f1 1 0 1 0 1 m=8UU8(QS8TS8={c,e}{d,f}, n1pC2) Corr8,1=1 Corr8,2=und Corr8,5=1 Corr8,6=und Top28={1,5} 111111 (c,8)Q TSc={125} {15}=Top28TSc so PruneOff v : MidBitsc,vIn{15} differ>1 none! leaves {d,f}TS8 2101 0 0 5101 0 1 (e,8)Q TSe={1}=Top28TSe so PruneOff v : MidBitse,vIn{1} diffe>1: none! leaves {d,f}TS8 61 0 1 0 0 81 0 1 0 1 c0 0 1 1 0 0 d0 1 0 0 1 0 e0 1 0 0 0 0 f0 1 0 1 0 1 So substantial pruning except for e 10 0 1 1 1 20 1 0 0 0 50 1 0 0 1 60 0 1 0 0 80 0 0 0 1

  25. Heuristic algorithm to PruneOff v's that can't give low 1pC2u,v values for uQSm U\M 1 2 5 6 8 c 6 5 5 d 3 6 6 7 e 3 f 7 7 7 cQ Q d eQ Q Q fQ Q 3 0 1 1 4 1 0 0 5 1 0 1 6 1 1 0 7 1 1 1 We note that this can be done independent of movies. That is, given any uQ|U, PruneOff noncompetitive n1pC2 v's using the MidBit only. T(U, M,r) c 1 6 110 c 2 5 101 c 5 5 101 d 1 3 011 d 2 6 110 d 5 6 110 d 6 7 111 d 8 6 110 e 1 3 011 f 1 7 111 f 5 7 111 f 8 7 111 cQ|U PruneOff v : any MidBitsInCoSuppc,v differ: Prune {d,f} dQ|U PruneOff if any MidBitsInCoSuppd,v differ: Prune {} eQ|U PruneOff if any MidBitsInCoSuppe,v differ: Prune {} c2 1 1 1 0 0 d2 0 1 1 1 1 e2 0 0 0 0 0 f2 1 0 1 0 1 12 1 0 0 1 221 1 0 0 521 1 0 1 62 0 1 0 0 82 0 1 0 1 c1 1 0 0 0 0 d1 1 1 1 1 1 e1 1 0 0 0 0 f1 1 0 1 0 1 111111 2101 0 0 5101 0 1 61 0 1 0 0 81 0 1 0 1 c0 0 1 1 0 0 d0 1 0 0 1 0 e0 1 0 0 0 0 f0 1 0 1 0 1 10 0 1 1 1 20 1 0 0 0 50 1 0 0 1 60 0 1 0 0 80 0 0 0 1

  26. = xoyy = xoyy xyshad  xoyy yoy |y| |y| |y|2 SLxyshad SignedLenxyshad = xo y xyperp x - xyshad = x - xoy y |y| yoy xyperp Strategy-1 SL(v-u)1shad =v-u= v's vote downshift in predicting um, so v casts its ballots for rating, vm-v+u xyshad 1 n *yshad (*yperp) linear |xyperp|2 = |x|2 - |xyshad|2 vkRu,m Corrm,k* Corrv,u S1RR:um= vkRu,mCorrm,k*Corrv,u*(vm- vk+ uk) S1RV.aum= vUu.m, kMum,vCorrm,k*Corrv,u * (vm- v + u ) S1RV:um= vUu.m, kMu.mCorrm,k*Corrv,u * (vm- v + u ) vkRu,mCorrm,k*Corrv,u vUu,m kMu,mCorrm,k *Corrv,u vUu,m kMum,vCorrm,k *Corrv,u S1VR:um= vkRu,mCorrm,k *Corrv,u* (vm- vk+ uk) S1VV:um= vUu.m Corru,v* (vm- v + u ) vUu,m Corru,v The shadow vector made by x on y, xyshad  dot product of x with a unit vector in the y-directiontimes that unit vector. y(1..1)=1: yoy=n, |y|=n, xoy=xk x1shad= x1 =1 y x1perp = x - x1 SLx1shad=xo1/n= nx  L(v-u)1perp = | v-u-(v-u) |measures signal impurity and inversely determines 1perpCorru,v or 1pCu,v x =v-u L(v-u)1perpis a pseudo-metric: Symmetric |(v-v)-(u-u)|=|-[(u-u)-(v-v)]|=|(u-u)-(v-v)|. Triangle inequality: |(w-w)-(u-u)|=|(w-w)-(v-v)+(v-v)-(u-u)|  |(w-w)-(v-v)|+|(v-v)-(u-u)| Ru,m any pruned set of ratings : vkRu,m vSuppm and uSuppk, e.g., a DataWarehouse dice, UM{vk | vU, kM}. eg, CTopK(CoSuppv,u) Ru,m=Uu,mMu,m 1pCorru,v:linear: N-|(v-u)1perp|, reciporical: 1/(1+|(v-u)1perp| ) quadratic: N-|(v-u)1perp|2 or N-(v-u)1po(v-u)1p(N : 0) Gaussian: ae- b|(v-u)1perp|2(a cancels numerator-denominator),custom(Note: Only 6 ratings, so can custom build as a case stmt) Other Correlations:Pearson's PCorr; exact matchEMCorr uses L(v-u)Here length assumes a distance, Lp. Even Hamming (Tingda EMCorr uses: L(v-u)=#nonmatches, so |CoSuppu,v|-L(v-u) = #matches). Correlation is often relativized by dividing by CoSuppu,v (Pearson divides by product of variances?). Movie Corr is the same.

  27. Pearsonu,v = kCoSuppu,v[uk- u]*[vk- v] / (kCoSuppu,v[uk-u]2)(kCoSuppu,v[vk-v]2) Collaborative Filtering and GroupLens Herlocker, Konstan and Riedl Pa,iis: um= u + vUu,mCorru,v*(vm - v ) vUu,mCorru,v For small z, wa,u: Corru,v= kCoSuppv,u [z+uk- u]*[z+vk- v] (z2+k[uk-u]2)(z2+k[vk-v]2) or EMCorrm,n. Rotation: um = m+nTopK(m)Corrm,n*(nu - m ) / nTopK(m) Corrm,n v 1 v1perp u1perp u = u1perpov1perp/|u1perp|2 * |v1perp|2 = [u- u1]o[v- v1] / [u-u1]o[u-u1] * [v-v1]o[v-v1] (v-v- u+u)o(v-v - u+u) = (v-v1)o(v-v1) -2(v-v1)o(u-u1) +(u-u1)o(u-u1) Strategy-2, S2, (u based) use Pearson. S1 is a vm-based strategy. S2 on um's where exact match is strong and for others, S1? Partition S1, S2 voters, then vote linear combo (not linear combo over all voters). = cos[u1Plane, v1Plane] = u1perpov1perp/ |u1perp|*|v1perp| 1perpCorrv,u uses Len(v-u)1perp= |v-u- v+u| over Mu,mSuppu TopK1perpCorrelatedMoviesm(no Suppv ?). 1perpCorrm,n uses Len(n-m)1perp= |n-m- n+m| over Uu,m=CoSuppm,n . If all Corrv,u are small (all TopK vs are very pure shifts) issue fixed #ballots, (vertically computable), (or podium rings where Corrv,u is nearly constant - then on each ring, issue a constant # ballots...) Vertical calculation?, could use 1perpCorr2, e.g.,1perpCorr2v,u = g((v-u- v+u)o(v-u - v+u)) for some droppoff, g -2vou +2vou1 +2v1ou -2v1ou1 = vov -vov1 -v1ov +v1ov1 WikiPed: Peason = cosine of angle of 2 vectors Caution: need centered data (shift by sample mean so avg = 0. Some like uncentered (nonPearson). = vov -2vvo1 +nv2 -2vou +2uvo1 +2v1ou -2nvu -2vou +2nuv +2nvu -2nvu = vov -2nv2 +nv2 + uou -nu2 = vov -nv2 -2vou +2nuv n1pC2u,v  (v-u)o(v-u) / n - [v-u]2 where n=|CoSuppu,v| = vov-2vou+uou-nv2 +2nuv-nu2 = vov -2vou +uou -n[v2 -2uv +u2] = (v-u)o(v-u)-n[v-u]2

  28. Pu (i)=1 iff miSuppui=1..17770 Pm(i)=1 iff uiSuppm{ui}Suppmi=1..480189 m1 m17770 m1 Pm,r(i)=1 iff uirSuppm r=3..7 Pu,r(i)=1 iff mirSuppu MM m\uPtrees: u\mPtrees: Pm,0(i)=1 iff MOD((ui)m,2) = 1 Pu,0(i)=1 iff MOD(u(mi) ,2)=1 TopK Pm,1(i)=1 iff MOD(DIV((ui)m,2),2)=1 Pu,1(i)=1 iff MOD(DIV(u(mi) ,2),2)=1 m17770 Corrm,n Pm,2(i)=1 iff MOD(DIV(DIV((ui)m,2),2),2)=1 Pu,2(i)=1 iff MOD(DIV(DIV(u(mi) ,2),2),2)=1 M D Suppm TopK Predict um: pruneSupputo TopK (ClosedTopK?) wrt Corrm,n , m1 m17770 ui1 u1 uiQM u480189 u1 u1 u1 pruneSuppmto TopJ (ClosedTopJ?) wrt Corru,v U UM UU O (=One) compression: Let {Ij}j=1..n partition userID space [movieID], u\m Ovectors (len=n, one  component) and Zvectors (len=|comp| ): Suppu Om( j)=1 iff Ij Suppm then Zm,j(k)=1 iff {uj+k}Suppmk=1..len(Ij) Om,r( j)=1 iff Ij rSuppm then Zm,j,r(k)=1 iff {uj+k}rSuppm ... um,D u480189 Corru,v u480189 u480189 T (=Two) compression: {Ij}i=1..n partition userIDs. Construct u\m T vecs as above:  P vec, another level of compression, construct O vecs Netflix compression? Not necessary for u\m and m\u! But we might consider it for um\r  Main(u,m,r). e.g., partition U into 481 width=1000 intervals: Om( j)=1 iff {uj+k}k=0..999Suppm and then Zm,j(k)=1 iff {uj+k}Suppm or 693 width=693 intervals: Om( j)=1 iff {uj+k}k=0..692Suppm and then Zm,j(k)=1 iff {uj+k}Suppm 693*134=92862 width (u,m)-intervals:O(j,a)=1iff {(uj+k,ma+b)| r(uj+k,ma+b)>0}k=0..693, b=0..134j=1..693, a=1..134 thenZj,a(k,b)=1 iff r(uj+kma+b) >0 (6932 =480249 1342 =17956 792*272 width=79*27 (u,m)-ints (793 =493039 273 =19683)T(j,a)=1iff {(uj+k,ma+b)| r(uj+k,ma+b)>0}k=0..79, b=0..27 Oj,a(k,b)=1iff {(uj+k+l,ma+b+c)| r(uj+k+l,ma+b+c)>0}l=0..79, c=0..27 Zj+k,a+b(l,c)=1 iff r(uj+k+lma+b+c)>0 (all lengths=79*27=2133) UUM card QSM u UUm QSuppm TSuppm TSM 2 axes Lsts (L) by listing positions satisfypredicate, not bitmapping positions. 
Rolodex model: 3 interaction cards. We need only a partial UUTbl indexed by movies: $\forall m \in Q|M$, we need only $UU_m(QSupp_m \times TSupp_m)$. Since Q and T are roughly equal in size, this is about the same size as $TSupp_m \times TSupp_m$, but $QS_m \times TS_m$ gives exact TopKs. So $\forall (u,m) \in Q$, we need the TopK u-correlated $v$'s $\in TS_m$. So $\forall m$, we compute $Corr_{u,v}$ $\forall u \in QSupp_m$ and $\forall v \in TSupp_m$, then record only the TopK of them! And if $Corr_{u,v}$ has previously been computed (for some other movie), we can skip it.

  29. For a pure shift on n-1 movies that is a pure shift except for 1 movie, then |(v-u)1perp| = |v-u-v+u| = (n-1)2/n2 + (n-1)*(1/n)2 = (n2-2n+1 + n-1) / n2 = (n2-n)/n2= 1-1/n 1 as n u\C1,C2.. 1=(1,1) 1 1 0 2 1 0 3 1 0 4 1 0 5 0 1 6 0 1 7 1 1 u1 u2 u3 u4 u5 u6 u7 (v2-u2)1perp = (1/2, -1/2) Length = 1/2 = 0.7071 UUTbl: u1 u2 1perpCorr 1 2 1 3 2 3 1 4 2 4 3 4 5 6 5 7 6 7 1 7 2 7 3 7 4 7 creator 1 u1 u2 u3 u4 u5 u6 u7 v2-u2= (s+1,s) pC2,1 1 pC3,1 pC3,2 1 1=(1,1,1) pC4,1 pC4,2 pC4,3 1 creator 2 pC6,5 2 (v1-u1)1perp=(2/3,-1/3,-1/3) Length = 6 / 3 = 0.8165 pC7,1 pC7,2 pC7,3 pC7,4 pC7,5 pC7,6 1,2 ClusterTHRESH: C\ pCorrs,MaxpCorr,Creator,uCnt v1-u1 = (s+1,s,s) DIC (Dyn Itemset Cntg): form um\P (by concat each of Amal's u\m basic ptrees (into 2650 user-segs): u1-u1000=seg1; u2649001-u2649429=se2650 To get 1pCs, could get v-u-v+u by circularly shifting copy of basic Ptrees down 17770 (to get all u - (v+1) > pairings). fork proc doing same with 2*17770 circular downshift... a 2650 partition of um\P: u m P2 P1 P0 u1 m1 1 1 0 : u1 m17770 ... u2649429 m1 : u2649429 m17770 0 0 1 Length Choices: L2 (Eucl Len) : |v-u|2 = ( (vi-ui)2 ); L1 (Manh): |v-u|1= |vi-ui|; L(Max): |v-u|=MAX |vi-ui|. abs value, | | comp probs? |v-u|new = |(vi-ui)| best computationally, but measure impurity badly (wrt 1=(1,...,1) ) (cancelations across coords). close enough though? Include full formula of impurity, |v-u-vb+ub| better? (centering vi's and ui's should reduce ill effects of cancels?) 1pC2 afterall (w Eucl Len), then impurity: (v-vb-u+ub)o(v-v-u+u) = (vi-v-ui+u)2 where (v-v-u+u)o(v-v-u+u). it is all set for app of Gaussian in the second part (vote)? Impurity (if defined as 1pCu,v |(v-u)1perp|=|v-u-v+u| ) incr with n, so the higher the n the fewer ballots. AND if we apply a Gaussian to get #ballots, the rather slight difference can be exagerated? But v9 (CoSupp=10 with 9 pureshifts) should be considered a stronger signal than a v1 (CoSupp=2). 
But $|(v_9-u)_{1perp}| = 0.94$ while $|(v_1-u)_{1perp}| = 0.71$ $\Rightarrow$ $v_1$ would be considered more pure than $v_9$! The formula for impurity may need to be adjusted? If impurity $\equiv |(v-u)_{1perp}| / |CoSupp_{v,u}|$ (Normalized ImPurity = NIP), then $NIP(v_9,u) = .094$, $NIP(v_1,u) = 0.35$ ($v_9$ is ~4 times purer than $v_1$). With $1pC_{v,u} \equiv e^{-NIP^2(v,u)}$: $1pC_{v_9,u} = .99$, $1pC_{v_1,u} = .89$ (Not much difference!) Try $0.1/x^2$: $1pC_{v_9,u} = 11.3$, $1pC_{v_1,u} = 0.8$. Try a custom dropoff close to $.1/x^2$? Or $1pC_{v,u} \equiv \{|CoSupp_{v,u}| / (1 + L(v-u)_{1perp})\}^2$, or a custom approximation. 1pC(v9,u)~26.6 1pC(v1,u)~2.2 1pC(v1.2,u)~1.4 1pC(v9.2,u)=25 1pC(v9.3,u)=21.8, where $v_{c,k}$ has CoSupp=c and 1 nonpure shift. Cluster Users prior to building UUtable (so we only have to choose pairs from the same cluster component): DIC method. Do a lower diagonal of (u,u+1)'s at a time. Create 480188 entries on the -1 diagonal (oblique diag below main diag by 1), then the -2 diagonal (below by 2), ..., then the -480188 diagonal. Then delete the 1st and last segments, -480189 diagonal ..., then -960377 diagonal. At that point we can delete the 2nd and 2ndLast segments . . . 240094 such rounds tho ;-( Apply Dynamic Itemset Counting (DIC) AFTER clustering U's (and sort). Do the DIC technique inside each cluster only!

  30. 2 THRESHOLDs give up info. can retain all info by (though it might increase storage unacceptably or cause some other problem?): For 1st u, start cluster COMPu (u is called its creator).  new w, if w within t1 of an existing creator, x, put it in COMPx else if w within t2 of an existing creator, x, put it in compx and create COMPw else create COMPw 3 THRs? ( e.g., t1=.15 t2=.25 t3=.4 ) (distinguish comp user arrived via t1, t2, t3. 3 tbls: 1UUTbl, 2UUTbl, 3UUtbl (2UU, 3UU small.) For 1st u, start cluster 1compu (u is called its creator).  new w, if w within t1 of creator, x, put it in 1compx else if w within t2 of creator, x, put it in 2compx, create 1compw else if w within t3 of creator, x, put it in 3compx, create 1compw else create 1compw. nvRu,m Corrm,n* Corrv,u S1RR:mu= nvRu,mCorrm,n*Corrv,u*(nu- nv+ mv) S1RV.amu= vUu.m, nMum,vCorrm,n*Corrv,u * (nu- n + u ) S1RV:mu= vUu.m, nMu.mCorrm,n*Corrv,u * (nu- n + m ) nvRu,mCorrm,n*Corrv,u vUu,m nMum,vCorrm,n *Corrv,u vUu,m nMu,mCorrm,n *Corrv,u S1VR:mu= nvRu,mCorrm,n *Corrv,u* (nu- nv+ mv) S1VV:mu= nMu.mCorru,v* (nu- n + m ) nMu,m Corru,v Build ReducedUUTbl: RUUTbl(Creator, v, 1pC, |CoSupp|) (index col_v ?). v (in desc|Supp| order) and existing Creator if NS1pC(v,Creator)<c1^|CoSupp|>s1 add (Creator,v,NS1pC,CS) (? limit to topK creators?) elseif NS1pC(v,Creator)<c2^|CoSupp|>s2 add (Creator,v)fzzyTbl; (v,v,0,|Supv|)UUTbl else add (v,v,0,|Supv|)|UUTbl fzzyTbl(Creator, fzzyMember, 1pC) where fzzyMember is a the new Creator that didn't make s1 but did make s2 n1pCu,v  1pC2u,v / CoSuppu,v I suggest setting c1 and c2 very small and when classifying um, if clust(u) has too few neighbors in it, resort to exhaustive search for the Top10(u). Set s1=.2 (or .17?) s2 =s1 since dividing by |CoSupp| already favors large |CoSupp| to a great extent.(reason we need |CoSupp|>1: 1pC=0 always when CoSup=1 Strategy-1mSL(n-m)1shad =n-m = n's vote downshift in predicting mu. 
So, u casts for $n_u - \bar{n} + \bar{m}$. $L(n-m)_{1perp} = |n - m - (\bar{n} - \bar{m})\mathbf{1}|$ is the signal impurity and inversely determines $1perpCorr_{m,n} = 1pC_{m,n}$. E.g., $CTopK(CoSupp_{n,m})$; $R_{u,m} = U_{u,m} \times M_{u,m}$. Comment: Don't expect too many (if any) pure shifted Movie signals (a movie, n, that is a ratings vector over the user dims, $CoSupp_{m,n}$). And even if there were, e.g., a pure +1 signal, n, for m, it would be a movie such that the users who rated both n and m all rated m 1 lower than they rated n. u is not one of those users (since u did not rate m). It does not seem intuitive that therefore u would necessarily rate m 1 lower than it rated n.

  31. RMSE SE minum 0.025 0.142 0.351 0.308 0.398 0.364 0.342 0.331 0.397 0.389 0.371 0.418 0.425 .25 0.410 0.406 0.397 0.401 0.396 0.398 0.412 0.437 0.427 0.431 0.432 0.423 # RMSE_F SqErF prdR_F gvnRF RMSE_u SqEr_u prdR_u RMSE_m SqEr_m prdR_m MOVIE ID=1 mean Rating=3.74954 var Rating=1.13837 user Cnt=547 t=549 1 0.19 0.035 4.19 4 0.026 0.00066 3.97 0.4 0.16 4.4 2 0.24 0.0762 3.72 4 0.53 0.566 3.25 0.32 0.04 4.2 3 0.34 0.237 3.49 3 0.55 0.329 3.57 0.35 0.16 3.4 4 0.31 0.0406 4.8 5 0.48 0.0106 4.9 0.34 0.09 4.7 5 0.48 0.759 4.13 5 0.51 0.413 4.36 0.58 1.21 3.9 MOVIE ID=10 mean Rating=3.18 var Rating=1.28 user Cnt=249 6 0.44 0.0317 3.18 3 0.47 0.00192 2.96 0.55 0.16 3.4 7 0.41 0.000299 3.02 3 0.44 0.0274 2.83 0.52 0.04 3.2 MOVIE ID=1000 mean Rating=3.28 var Rating=1.52 user Cnt=760 8 0.39 0.0137 3.12 3 0.42 0.055 3.23 0.48 0 3 9 0.53 1.37 4.17 3 0.47 0.547 3.74 0.7 2.56 4.6 10 0.57 0.727 2.85 2 0.63 1.97 3.4 0.67 0.09 2.3 11 0.55 0 5 5 0.6 0 5 0.64 0 5 12 0.61 1.18 3.92 5 0.61 0.59 4.23 0.73 1.96 3.6 MOVIE ID=10000 mean Rating=2.9 var Rating=1.18 user Cnt=215 13 0.82 4.3 2.93 5 0.75 2.72 3.35 0.99 6.25 2.5 14 0.79 0.0709 3.73 4 0.72 0.00453 4.07 0.97 0.36 3.4 MOVIE ID=10001 mean Rating=3.8 var Rating=1.61 user Cnt=158 15 0.77 0.000577 4.02 4 0.7 0.124 3.65 0.94 0.16 4.4 16 0.74 0.025 4.16 4 0.69 0.266 4.52 0.91 0.04 3.8 17 0.74 0.341 4.58 4 0.68 0.218 4.47 0.9 0.49 4.7 18 0.72 0.0747 3.73 4 0.69 0.717 3.15 0.88 0.09 4.3 19 0.7 0.0325 3.82 4 0.68 0.193 4.44 0.87 0.64 3.2 MOVIE ID=10002 mean Rating=3.44 var Rating=1.73 user Cnt=2624 20 0.74 1.71 3.69 5 0.68 0.38 4.38 0.96 4 3 21 0.79 2.07 2.56 4 0.68 0.61 3.22 1 4.41 1.9 22 0.77 0.00201 4.96 5 0.67 0.00201 4.96 1 0.00201 4.96 23 0.76 0.0656 4.74 5 0.66 0.262 4.49 1 0 5 MOVIE ID=10003 mean Rating=2.47 var Rating=1.54 user Cnt=107 24 0.76 0.684 2.17 3 0.65 0.206 2.55 1 1.44 1.8 MOVIE ID=10004 mean Rating=4.21 var Rating=0.911 user Cnt=4021 25 0.750.0418 4.8 5 0.640.00007 4.99 0.990.16 4.6 Given um to predict, take Top10 
e.g., for RMSE_u, from uu, then take top10 mm from CoSupp(u,v). RMSE_F = using 50/50 combo of both approaches; RMSE_u = using uu to find NN and then mm; RMSE_m = using mm to find NN and then uu. Characterize the um in rows 2 (movie 1), 10 (movie 1000), 16 (movie 10001) and 18 (movie 10001) (SqEr_m << SqEr_u)? If a signal, use prdR_m! (result RMSE = 0.527) E.g., $CoSupp_{u,v} \gg CoSupp_{m,n}$? Or something else we can pick up from UUTbl and MMTbl?? Or even from $Supp_u$ and $Supp_m$? Characterize row 13 (movie 10000) (SqEr_u and SqEr_m both off)? Try another method? Use avg? (if, e.g., $\bar{u}_{Supp_u}$ = 4.5, res RMSE = 0.423) Cluster for Partial UUTbl (using 1 or 2 thresholds?). Top10 from PUUTbl. For each u, if Top10 not good enough, generate dynamically. Set a lower 1pC Limit (instead of Top10?). E.g., if 4 v's with 1pC>Lim, use them. Count #cluster nbrs >Lim; if, say, 4, predict with only 4; if not, generate Top10 dynamically? Amal: agree, except for the idea to go for a threshold when classifying (weighting (exp) will automatically take care of insignificant nbrs!) Mark users whose cluster gives poor Top10? Then use entire U for those so that we improve Top10 as much as possible. One way: check # users in cluster with 1pC > Limit (for some, it will be because 10 don't exist with 1pC>Limit in U). Record # users with 1pC>Lim or just keep the avg 1pC per cluster. If lo, use U! In case we need a small boost to RMSE.

  32. RMSE SE minum 0.025 0.527 0.543 0.473 0.511 0.467 0.437 0.417 0.464 0.450 .09 0.429 0.467 0.469 .25 0.453 0.449 .16 0.454 0.454 0.485 .09 0.482 0.490 0.507 0.496 0.497 0.495 0.485 Amal's Set-2 (bold set): Using rnded predictions helps? No! RMSEu 0.39 0.62 0.73 0.68 0.61 0.56 0.60 0.56 0.60 0.67 0.72 0.69 0.68 0.66 0.64 0.62 0.66 0.65 0.64 0.63 0.64 0.63 0.76 0.75 0.74 SEu 0.15 0.62 0.82 0.27 0.01 0.04 0.64 0.02 0.73 1.27 1.18 0.00 0.36 0.00 0.00 0.06 1.21 0.17 0.27 0.23 0.54 0.29 4.36 0.44 0.08 roundedSEu 0 1 1 1 0 0 1 0 1 1 1 0 1 0 0 0 1 0 1 0 1 1 4 1 0 roundedRMSEu 0 0.70 0.81 0.86 0.77 0.70 0.75 0.70 0.74 0.77 0.79 0.76 0.78 0.75 0.73 0.70 0.72 0.70 0.72 0.70 0.72 0.73 0.83 0.84 0.82 # RMSE_F SqErF prdR_F gvnRF RMSE_u SqEr_u prdR_u RMSE_m SqEr_m prdR_m 11 0.19 0.035 4.19 4 0.026 0.00066 3.97 0.4 0.16 4.4 21 0.24 0.0762 3.72 4 0.53 0.566 3.25 0.32 0.04 4.2 31 0.34 0.237 3.49 3 0.55 0.329 3.57 0.35 0.16 3.4 41 0.31 0.0406 4.8 5 0.48 0.0106 4.9 0.34 0.09 4.7 51 0.48 0.759 4.13 5 0.51 0.413 4.36 0.58 1.21 3.9 610 0.44 0.0317 3.18 3 0.47 0.00192 2.96 0.55 0.16 3.4 710 0.41 0.000299 3.02 3 0.44 0.0274 2.83 0.52 0.04 3.2 81000 0.39 0.0137 3.12 3 0.42 0.055 3.23 0.48 0 3 91000 0.53 1.37 4.17 3 0.47 0.547 3.74 0.7 2.56 4.6 1010000.57 0.727 2.85 2 0.63 1.97 3.4 0.67 0.09 2.3 111000 0.55 0 5 5 0.6 0 5 0.64 0 5 121000 0.61 1.18 3.92 5 0.61 0.59 4.23 0.73 1.96 3.6 13100000.824.32.93 5 0.752.723.35 0.996.252.5 1410000 0.79 0.0709 3.73 4 0.72 0.00453 4.07 0.97 0.36 3.4 1510001 0.77 0.000577 4.02 4 0.7 0.124 3.65 0.94 0.16 4.4 16100010.74 0.025 4.16 4 0.69 0.266 4.52 0.91 0.04 3.8 1710001 0.74 0.341 4.58 4 0.68 0.218 4.47 0.9 0.49 4.7 18100010.72 0.0747 3.73 4 0.69 0.717 3.15 0.88 0.09 4.3 1910001 0.7 0.0325 3.82 4 0.68 0.193 4.44 0.87 0.64 3.2 2010002 0.74 1.71 3.69 5 0.68 0.38 4.38 0.96 4 3 2110002 0.79 2.07 2.56 4 0.68 0.61 3.22 1 4.41 1.9 2210002 0.77 0.00201 4.96 5 0.67 0.00201 4.96 1 0.00201 4.96 2310002 0.76 0.0656 4.74 5 0.66 0.262 4.49 1 0 5 2410003 0.76 0.684 
2.17 3 0.65 0.206 2.55 1 1.44 1.8 25 10004 0.75 0.0418 4.8 5 0.64 0.00007 4.99 0.99 0.16 4.6 Suppose we have characterized the greens. A simple signal for the blues: If SqEr_u > 2*RMSE then use prdR_m. (If, e.g., we have used $\bar{u}_{Supp_u}$ = 4.5 for row 13 (movie 10000), then using the simple blue signal above, the res RMSE = 0.485. Pretty good!) Another simple blue signal: If SqEr_u > 1.5*RMSE use prdR_m. (If, e.g., we have used $\bar{u}_{Supp_u}$ = 4.5 for row 13 (movie 10000), then using this blue signal, the res RMSE = 0.485. No better!) Another very simple signal for the blues: If SqEr_u > 2*RMSE and if $var(u_{Supp_u})$ is small, use the prediction $\bar{u}_{Supp_u}$. Another very simple signal for the blues: If SqEr_u > 2*RMSE use the prediction $\bar{u}_{Supp_u \cap Top10_m}$.

  33. RMSE cumm_F 0.19 0.24 0.34 0.31 0.48 0.44 0.41 0.39 0.53 0.57 0.55 0.61 0.82 0.79 0.77 0.74 0.74 0.72 0.7 0.74 0.79 0.77 0.76 0.76 0.75 0.741 0.757 0.778 0.765 0.752 0.741 0.730 0.719 0.715 0.705 0.723 0.713 0.709 0.700 0.692 0.689 0.690 0.687 0.685 0.678 0.676 0.670 0.712 0.712 0.705 RMSE cumm_u 0.025 0.527 0.543 0.473 0.511 0.467 0.437 0.417 0.464 0.625 0.596 0.612 0.745 0.718 0.699 0.689 0.678 0.689 0.678 0.675 0.680 0.665 0.659 0.652 0.638 0.631 0.637 0.649 0.645 0.634 0.625 0.631 0.622 0.630 0.650 0.666 0.657 0.655 0.647 0.639 0.632 0.647 0.640 0.637 0.634 0.637 0.635 0.697 0.696 0.690 RMSE cumm_m 0.4 0.32 0.35 0.34 0.58 0.55 0.52 0.48 0.7 0.67 0.64 0.73 0.99 0.97 0.94 0.91 0.9 0.88 0.87 0.96 1 1 1 1 0.99 0.977 1.207 1.312 1.140 1.020 0.939 0.890 0.832 0.791 0.841 0.892 0.855 0.833 0.802 0.786 0.787 0.769 0.761 0.758 0.739 0.726 0.709 0.761 0.763 0.747 # RMSE_F SqErF prdR_F gvnRF RMSE_u SqEr_u prdR_u RMSE_m SqEr_m prdR_m 11 0.19 0.035 4.19 4 0.026 0.00066 3.97 0.4 0.16 4.4 21 0.24 0.0762 3.72 4 0.53 0.566 3.25 0.32 0.04 4.2 31 0.34 0.237 3.49 3 0.55 0.329 3.57 0.35 0.16 3.4 41 0.31 0.0406 4.8 5 0.48 0.0106 4.9 0.34 0.09 4.7 51 0.48 0.759 4.13 5 0.51 0.413 4.36 0.58 1.21 3.9 610 0.44 0.0317 3.18 3 0.47 0.00192 2.96 0.55 0.16 3.4 710 0.41 0.000299 3.02 3 0.44 0.0274 2.83 0.52 0.04 3.2 81000 0.39 0.0137 3.12 3 0.42 0.055 3.23 0.48 0 3 91000 0.53 1.37 4.17 3 0.47 0.547 3.74 0.7 2.56 4.6 1010000.57 0.727 2.85 2 0.63 1.97 3.4 0.67 0.09 2.3 111000 0.55 0 5 5 0.6 0 5 0.64 0 5 121000 0.61 1.18 3.92 5 0.61 0.59 4.23 0.73 1.96 3.6 13100000.824.32.93 5 0.752.723.35 0.996.252.5 1410000 0.79 0.0709 3.73 4 0.72 0.00453 4.07 0.97 0.36 3.4 1510001 0.77 0.000577 4.02 4 0.7 0.124 3.65 0.94 0.16 4.4 16100010.74 0.025 4.16 4 0.69 0.266 4.52 0.91 0.04 3.8 1710001 0.74 0.341 4.58 4 0.68 0.218 4.47 0.9 0.49 4.7 18100010.72 0.0747 3.73 4 0.69 0.717 3.15 0.88 0.09 4.3 1910001 0.7 0.0325 3.82 4 0.68 0.193 4.44 0.87 0.64 3.2 2010002 0.74 1.71 3.69 5 0.68 0.38 4.38 0.96 4 3 2110002 
0.79 2.07 2.56 4 0.68 0.61 3.22 1 4.41 1.9 2210002 0.77 0.00201 4.96 5 0.67 0.00201 4.96 1 0.00201 4.96 2310002 0.76 0.0656 4.74 5 0.66 0.262 4.49 1 0 5 2410003 0.76 0.684 2.17 3 0.65 0.206 2.55 1 1.44 1.8 2510004 0.75 0.0418 4.8 5 0.64 0.00007 4.99 0.99 0.16 4.6 u is still winning. Suggest looking for a signal as to when to subsitute prdR_m for prdR_u rather than combine over all of them. 110091 0.49 0.243 3.49 3 0.39 0.149 3.39 0.6 0.36 3.6 2 0.85 1.2 3.1 2 0.62 0.627 2.79 1.1 1.96 3.4 3 0.98 1.45 3.2 2 0.73 0.824 2.91 1.2 2.25 3.5 4 0.85 0.0254 3.16 3 0.68 0.269 3.52 1.1 0.04 2.8 510092 0.76 0.00337 3.94 4 0.61 0.0135 3.88 0.96 0 4 6 0.71 0.0637 4.25 4 0.57 0.042 4.2 0.89 0.09 4.3 7 0.66 0.0224 3.85 4 0.61 0.639 3.2 0.84 0.25 4.5 8 0.61 0.00687 3.92 4 0.57 0.0275 3.83 0.79 0 4 9 0.61 0.334 4.42 5 0.61 0.732 4.14 0.75 0.09 4.7 1010094 0.58 0.00114 2.97 3 0.68 1.28 4.13 0.8 1.44 1.8 11 0.66 1.43 3.81 5 0.73 1.18 3.91 0.86 1.69 3.7 12 0.63 0.00174 3.96 4 0.69 0.000271 4.02 0.83 0.01 3.9 13 0.63 0.301 3.45 4 0.69 0.356 3.4 0.81 0.25 3.5 14 0.6 0.000546 4.02 4 0.66 0.00219 4.05 0.78 0 4 1510095 0.59 0.0702 4.73 5 0.64 0.000901 4.97 0.76 0.25 4.5 16 0.58 0.28 4.47 5 0.62 0.0664 4.74 0.76 0.64 4.2 17 0.59 0.563 4.25 5 0.66 1.21 3.9 0.75 0.16 4.6 18 0.59 0.26 4.49 5 0.65 0.176 4.58 0.74 0.36 4.4 19 0.59 0.369 4.39 5 0.64 0.266 4.48 0.74 0.49 4.3 2010096 0.58 0.0581 4.24 4 0.64 0.233 4.48 0.72 0 4 21 0.58 0.324 3.43 4 0.64 0.546 3.26 0.71 0.16 3.6 22 0.57 0.0729 4.27 4 0.64 0.292 4.54 0.69 0 4 23 0.67 3.22 3.21 5 0.76 4.36 2.91 0.74 2.25 3.5 24 0.67 0.537 3.73 3 0.76 0.444 3.67 0.75 0.64 3.8 25100970.660.0212 2.85 3 0.740.0847 2.71 0.73 0 3

  34. # RMSEF SqErF prdF gvnR RMSEu SqEru prdu RMSEm SqErm prdm usAvg usVar usMvC mID mvAvg mvVar mvUsC 1 0.19 0.035 4.19 4 0.026 0.0006 3.97 0.4 0.16 4.4 3.633 0.685 1290 1 3.75 1.14 547 2 0.24 0.076 3.72 4 0.53 0.566 3.25 0.32 0.04 4.2 3.233 1.13 1001 1 3.75 1.14 547 3 0.34 0.237 3.49 3 0.55 0.329 3.57 0.35 0.16 3.4 3.55 1.18 60 1 3.75 1.14 547 4 0.31 0.041 4.8 5 0.48 0.0106 4.9 0.34 0.09 4.7 4.656 0.713 160 1 3.75 1.14 547 5 0.48 0.759 4.13 5 0.51 0.413 4.36 0.58 1.21 3.9 3.6 2.17 30 1 3.75 1.14 547 6 0.44 0.032 3.18 3 0.47 0.0019 2.96 0.55 0.16 3.4 3.41 0.854 363 10 3.181 1.28 249 7 0.41 0.000 3.02 3 0.44 0.0274 2.83 0.52 0.04 3.2 3.155 0.388 303 10 3.181 1.28 249 8 0.39 0.014 3.12 3 0.42 0.055 3.23 0.48 0 3 3.6 0.873 215 1000 3.284 1.52 760 9 0.53 1.37 4.17 3 0.47 0.547 3.74 0.7 2.56 4.6 3.292 0.915 65 1000 3.284 1.52 760 10 0.57 0.727 2.85 2 0.63 1.97 3.4 0.67 0.09 2.3 3.5 2.25 2 1000 3.284 1.52 760 11 0.55 0 5 5 0.6 0 5 0.64 0 5 5 0 49 1000 3.284 1.52 76 12 0.61 1.18 3.92 5 0.61 0.59 4.23 0.73 1.96 3.6 4.122 0.825 156 1000 3.284 1.52 760 13 0.82 4.3 2.93 5 0.75 2.72 3.35 0.99 6.25 2.5 4.387 1.59 31 10000 2.902 1.18 215 14 0.79 0.071 3.73 4 0.72 0.0045 4.07 0.97 0.36 3.4 4.262 0.879 172 10000 2.902 1.18 215 15 0.77 0.001 4.02 4 0.7 0.124 3.65 0.94 0.16 4.4 3.288 1.07 191 10001 3.804 1.61 158 16 0.74 0.025 4.16 4 0.69 0.266 4.52 0.91 0.04 3.8 4.189 0.718 280 10001 3.804 1.61 158 17 0.74 0.341 4.58 4 0.68 0.218 4.47 0.9 0.49 4.7 3.605 1.42 276 10001 3.804 1.61 158 18 0.72 0.075 3.73 4 0.69 0.717 3.15 0.88 0.09 4.3 2.538 1.02 26 10001 3.804 1.61 158 19 0.70 0.033 3.82 4 0.68 0.193 4.44 0.87 0.64 3.2 4.218 0.316 55 10001 3.804 1.61 158 20 0.74 1.71 3.69 5 0.68 0.38 4.38 0.96 4 3 4.5 0.595 58 10002 3.441 1.73 2624 21 0.79 2.07 2.56 4 0.68 0.61 3.22 1 4.41 1.9 3.529 1.66 34 10002 3.441 1.73 2624 22 0.77 0.002 4.96 5 0.67 0.0020 4.96 1 0.002 4.96 4.955 0.085 669 10002 3.441 1.73 2624 23 0.76 0.066 4.74 5 0.66 0.262 4.49 1 0 5 4.231 0.789 121 10002 3.441 1.73 2624 24 
0.76 0.684 2.17 3 0.65 0.206 2.55 1 1.44 1.8 3.615 0.786 499 10003 2.467 1.54 107 25 0.75 0.042 4.8 5 0.64 0.0000 4.99 0.99 0.16 4.6 4.544 0.476 193 10004 4.209 0.911 4021 26 0.77 1.45 3.8 5 0.67 1.45 3.79 1 1.44 3.8 3.545 1.16 55 10004 4.209 0.911 4021 27 0.76 0.371 3.61 3 0.66 0.0474 3.22 1 1 4 3.796 0.965 162 10004 4.209 0.911 4021 1 0.49 0.243 3.49 3 0.39 0.149 3.39 0.6 0.36 3.6 3.387 0.454 194 10091 2.686 1.27 207 2 0.85 1.2 3.1 2 0.62 0.627 2.79 1.1 1.96 3.4 3.364 0.959 11 10091 2.686 1.27 207 3 0.98 1.45 3.2 2 0.73 0.824 2.91 1.2 2.25 3.5 3.616 1.17 593 10091 2.686 1.27 207 4 0.85 0.0254 3.16 3 0.68 0.269 3.52 1.1 0.04 2.8 4.05 0.947 20 10091 2.686 1.27 207 5 0.76 0.00337 3.94 4 0.61 0.0135 3.88 0.96 0 4 3.424 0.679 92 10092 4.012 0.888 849 6 0.71 0.0637 4.25 4 0.57 0.042 4.2 0.89 0.09 4.3 3.203 0.943 64 10092 4.012 0.888 849 7 0.66 0.0224 3.85 4 0.61 0.639 3.2 0.84 0.25 4.5 3.492 0.77 177 10092 4.012 0.888 849 8 0.61 0.00687 3.92 4 0.57 0.0275 3.83 0.79 0 4 3.838 0.489 68 10092 4.012 0.888 849 9 0.61 0.334 4.42 5 0.61 0.732 4.14 0.75 0.09 4.7 3.218 1.2 156 10092 4.012 0.888 849 10 0.58 0.00114 2.97 3 0.68 1.28 4.13 0.8 1.44 1.8 3.7 1.01 20 10094 3.546 0.927 9779 11 0.66 1.43 3.81 5 0.73 1.18 3.91 0.86 1.69 3.7 3.593 1.22 518 10094 3.546 0.927 9779 12 0.63 0.00174 3.96 4 0.69 0.0003 4.02 0.83 0.01 3.9 4.187 0.676 904 10094 3.546 0.927 9779 The one improvement signal I find is: When Supp(u) < uThreshold, substitute prd_m for prd_u (assuming Supp(m) > mThreshold). It would be interesting to look at the correlation between Supp(u) and SqEr(u) (expect a very high negative correlation?)

  35. m\u pcards mQ|M, need UUm(QSuppmTSuppm , n1pC2) c0 0 1 1 0 0 d0 1 0 0 1 0 e0 1 0 0 0 0 f0 1 0 1 0 1 c2 1 0 0 0 0 d2 0 1 1 1 1 e2 0 0 0 0 0 f2 1 0 1 0 1 c1 0 1 1 0 0 d1 0 0 0 0 0 e1 0 0 0 0 0 f1 0 0 0 0 0 n1pC2u,v  (v-u)o(v-u)/n - [v-u]2 n=|TCoSuppu,v| 10 0 1 1 1 Q|M = {2,5,6,8} 12 1 0 0 1 11 0 0 0 0 20 1 0 0 0 22 0 1 0 0 21 1 0 0 0 m=2Q|M, UU2(QSupp2TSupp2 , n1pC2) = ({e,f}{c,d}, n1pC2) 50 1 0 0 1 52 0 1 0 1 51 1 0 0 0 60 0 1 0 0 62 0 1 0 0 61 0 0 0 0 80 0 0 0 1 82 0 1 0 1 81 0 0 0 0 n1pC2e,c  (c-e)o(c-e)/n - [c-e]2 n=|TCoSuppe,c|=|{1}|=1 n1pC2e,c  (c-e)o(c-e) - [c-e]2 = (4-1)o(4-1) - [4-1]2 = 9-9 = 0 0 T(U, M,r) c 2 3 c 1 4 c 5 3 d 6 5 d 2 4 d 1 1 d 5 4 d 8 4 e 1 1 f 5 5 f 1 5 f 8 5 Q(M,U) 2 e 2 f 5 e 6 c 6 f 8 c 8 e n1pC2f,c  (c-f)o(c-f)/n - [c-f]2 n=|TCoSuppf,c|=|{1,5}|=2 c-f = (4,3)-(5,5) = (-1,-2) c-f = -3/2 n1pC2f,c  (c-f)o(c-f)/2 - [c-f]2 = 5/2 - [-3/2]2 = 2.5 - 2.25 = .25 .25 n1pC2f,d  (d-f)o(d-f)/n - [d-f]2 n=|TCoSuppf,d|=|{1,5,8}|=3 d-f=(1,4,4)-(5,5,5)=(-4,-1,-1) d-f = -2 n1pC2f,d  (d-f)o(d-f)/3 - [d-f]2 = 18/3 - [-2]2 = 6 - 4 = 2 2 Note1: n1pC2c,d = 3.56 but what if movies 2 and 5 are the same genre but 1 is totally different genre? It's a pure shifted signal in the {2,5} genre! n1pC2e,d(d-e)o(d-e)/n-[d-e]2 n=|TCSe,d|=|{1}|=1 d-e=1-1=0 d-e=0 n1pC2e,d  (d-e)o(d-e) - [d-e]2 = 0 - [0]2 = 0 - 0 = 0 0 Note2: n1pC2c,d should not be calculated from Ptrees (no sum over all u or all m involved). It should be calculated from T values directly for best efficiency. Computation of a prediction, however, does involve a sum over a mask, and therefore can be efficiently done with Ptrees! 
m=5Q|M, UU5(QSupp5TSupp5 , n1pC2) = ({e}{c,d,f}, n1pC2) n1pC2e,d done n1pC2e,c done n1pC2e,f (f-e)o(f-e)/n-[f-e]2 n=|TCSe,f|=|{1}|=1 f-e= 5-1= 4 f-e=4 n1pC2e,f  (f-e)o(f-e) - [f-e]2 = 16 - [4]2 = 16 - 16 = 0 0 TSM c d e f n1pC2f,d done m=6Q|M, UU6(QS6TS6 , n1pC2) = ({c,f}{d}, n1pC2) QSM c e f |TCSc,d|=|{1,2,5}|=3 d-c=(1,4,4)-(4,3,3)=(-3,1,1) d-c=-1/3 n1pC2c,d(d-c)o(d-c)/3 - [d-c]2 = 11/3 - 1/9= 32/9 = 3.56 3.56 m=8Q|M, UU8(QS8TS8 , n1pC2) = ({c,e}{d,f}, n1pC2) n1pC2c,d done n1pC2e,d done n1pC2e,f done actually, n1pC2c,f done |TCSc,f|=|{1,5}|=2 f-c=(5,5)-(4,3)=(1,2) f-c=-3/2 n1pC2c,f(f-c)o(f-c)/2 - [f-c]2 = 5/2 - 9/4= 2.5 - 2.25 = .25 .25

  36. TSM c d e f QSM c e f n1pC2u,v(v-u)o(v-u)/n-[v-u]2 n=|Top2mTCSu,v| mQ|M; find Top2m; UUm(QSmTSm , n1pC2Top2mCoSupp) Q|M={2,5,6,8} m=2Q|M, UU2= ({e,f}{c,d}, n1pC2) n1pC2e,c  (c-e)o(c-e)/n - [c-e]2 n=|TCSe,c|=|{1}|=1 n1pC2e,c(c-e)o(c-e)-[c-e]2 = (4-1)o(4-1)-[4-1]2 =9-9=0 0 n1pC2f,c(c-f)o(c-f)/n-[c-f]2 n=|TCSf,c|=|{1,5}|=2 c-f=(4,3)-(5,5)=(-1,-2) c-f = -3/2 n1pC2f,c  (c-f)o(c-f)/2 - [c-f]2 = 5/2 - [-3/2]2 = 2.5 - 2.25 = .25 .25 n1pC2f,d(d-f)o(d-f)/n-[d-f]2 n=|1,5,8|=3 d-f=(1,4,4)-(5,5,5)=(-4,-1,-1) d-f=-2 n1pC2f,d(d-f)o(d-f)/3-[d-f]2 =18/3-[-2]2 =6-4=2 2 n1pC2e,d(d-e)o(d-e)-[d-e]2 =0-[0]2=0-0=0 0 n1pC2e,d(d-e)o(d-e)/n-[d-e]2 n=|{1}|=1 d-e=1-1=0 d-e=0 n1pC2e,c done n1pC2e,d done m=5Q|M, UU5(QSupp5TSupp5 , n1pC2) = ({e}{c,d,f}, n1pC2) n1pC2e,f(f-e)o(f-e)-[f-e]2=16-[4]2=16-16=0 0 n1pC2e,f(f-e)o(f-e)/n-[f-e]2 n=|{1}|=1 f-e=5-1=4 f-e=4 Note1: n1pC2c,d = 3.56 but what if movies 2 and 5 are the same genre but 1 is totally different genre? It's a pure shifted signal in the {2,5} genre! n1pC2f,d done m=6Q|M, UU6(QS6TS6 , n1pC2) = ({c,f}{d}, n1pC2) |TCSc,dTop26|=|{1,2,5}{2,5}|=2 d-c=(4,4)-(3,3)=(1,1) d-c= 1 n1pC2c,d(d-c)o(d-c)/2-[d-c]2=2/2-1=0=0 0 m=8Q|M, UU8(QS8TS8 , n1pC2) = ({c,e}{d,f}, n1pC2) n1pC2c,d done n1pC2e,d done n1pC2e,f done n1pC2c,f(f-c)o(f-c)/2-[f-c]2=5/2-9/4=2.5-2.25=.25 .25 actually, n1pC2c,f done |TCSc,f|=|{1,5}|=2 f-c=(5,5)-(4,3)=(1,2) f-c=-3/2 mQSU mPCorru,vexp-((v-u)o(v-u)TSQSm/|TSQSm| - v-uTSQSm2 ) =e -(av(|v-u|2) -av2|u-v|)vTSm =exp-(avTSQSm(|v-u|2) - |avTSQSm(u-v)|2) Now, can Jenny's TopK can be used: mQSU uQSm find Top10 mPCorru,v vTSm It would also be great if these computations can be batched with respect to the v's (combined by bitslice?). Caution: The UUTbl entry for a given u,v pair now depends upon the order of movie processing (scanning of QSU )! What's the best order? Does it matter? My best guess at this point is: decreasing order of Suppm ??? 
And possibly, rather than just skipping ahead when an mPCorru,v is already in UUTbl: if the existing mPCorru,v is higher than a threshold, compute a new one and replace the existing entry (assuming the new one is lower)? (However, if QSuppU is scanned in decreasing order of Suppm, that is not likely to happen?)

  37. Possible improvements: uu1PerpCorr compute time to be const Reuse uu values 0.16 0.14 0.12 0.35 0.1 0.3 0.08 1:1,2,3,……..100 2:1,2,3…… 100 3: 0.25 0.06 0.04 0.2 0.02 0.15 0 0.1 1 30 59 88 117 146 175 204 233 262 291 320 349 378 407 436 465 494 0.05 0 1 27 53 79 105 131 157 183 209 235 261 287 313 339 365 391 417 443 469 495 sec sec u\m m\u table Ptree Load(read) order u\m m\u table Ptree Load(read) order 1 bit PerpC Comp Time UUTbl comp approach • For each movie m in Q set (approx. 17K) • get the user sup list in Tm (Ui) • Read existing files for Ui and fill UixUi table • Compute the missing values for UixUi table • Write UixUi values as files of filename Ui: • repeat Req. number of computations/storage ? TotComp=0; For 1st movie in Q get the user sup list S1 = T1 TotComp = [CNT(S1)]2 mQ, Get user sup list Sm = Tm Comp=[CNT(Sm)–CNT(Sm^ S1) -..CNT(Sm^ Sn)… CNT(Sm^ Sm-1)]2 TotComp = TotComp + Comp uuTable storage U1:{u1|0.2}{u5|0.1}{u9|0.4} U5:{u1|0.2}{u9|0.4} U9:{u5|0.1}{u9|0.4} U11:{u1|0.2}{u5|0.1}{u25|0.4} Req time to create First 100 UUTbl: Avg time for 1 user=3.45 min. TotalTime=3.45*480189=1.6x10^6 min=1150 days • On 5 processors = 230 days !

  38. essentially this is Q sorted on M UVTbl(MQ,UQ, VT,1pC(uQ,vT)) S1RV um= vUu.m, nMum,vPCorrm,n*1pCv,u * (vm- v + u ) vUu,m nMum,vPCorrm,n *1pCv,u A better UUTbl?? A better Top10?? Right now we are using the vector space CoSuppu,v (CSu,v) for calculations of 1pCu,v, then including a PearsonCorrm,n (PnCu,v) factor when computing the prediction. My thought is: Would it be better to use, as the vector space (when looking for the Top10 v's for a um from Q), the TopK movies to m (then we wouldn't require that Suppv contain all TopK movies to qualify to be in the Top10 users to u)? I think that would give us a different Top10 v's and a better set (easier to find, since the vector space never changes until we change to a different m - and a better signal, since the movies would all be the most similar to m). The only caution would be that we would want to recognize and eliminate from Top10 candidacy exact-match v's which have all zero ratings in that vector space (v doesn't rate any of the TopK movies - which would give 1pC=0 iff u rates all TopK movies the same). Other rating=0 problems would automatically be taken care of, since those v's would not make Top10? (Would the problem be taken care of by just checking one time if Supp-u intersects TopK, because then a pure shifted signal to u couldn't be all zeros?) Maybe we would want to choose K so that Supp-u intersects TopK-m in at least 3 movies (or some other threshold - in other words, keep increasing K until we get enough overlap with Supp-u), then check if those ratings are all the same and, if so, check for pure zero on all v as Top10 candidates?

  39. U\M 1 2 5 6 8 c 4 3 3 d 1 4 4 5 e 1 f 5 5 5 cQ Q d eQ Q Q fQ Q Tcdef U\M 1 2 5 6 8 U\M 1 2 5 6 8 U\M 1 2 5 6 8 Q c e f c 4 3 3 d 1 4 4 5 e 1 f 5 5 5 c 4 3 3 d 1 4 4 5 e 1 f 5 5 5 c 4 3 3 d 1 4 4 5 e 1 f 5 5 5 Tcdef Tcdef Tcdef Q c e f Q c e f Q c e f 0 4 2.25 .25 6.25 8 6.25 0 6.25 4 .25 4 .25 mQ|M, need UUm(QSmTSm , n1pC2) in this slide, n1pC2 over Top2m wrt |exact|/CoSup, tiebreak=CoSup n1pC2u,v(v-u)o(v-u)/2 -[v-u]2 Q|M={2,5,6,8} m=2Q|M, UU2(QS2TS2={e,f}{c,d}, n1pC2) 0=Cor2,1over{c,d} 1=Cor2,5over{{c,d} 0=Cor2,6over{d} Cor2,8=undef Top22={1,5} PROBLEM? n1pC2e,c(c-e)o(c-e)/2-[c-e]2= 0 (3,3)o(3,3)/2-[7/2-1/2]2=9-9= 0 n1pC2f,d(d-f)o(d-f)/2-[d-f]2= (-4,-1)o(-4,-1)/2-[5/2-5]2=17/2-25/4= 2.25 2.25 n1pC2e,d(d-e)o(d-e)/2-[d-e]2=(0, 4)o(0,4)/2-[5/2-1/2]2=8-4=4 4 n1pC2f,c(c-f)o(c-f)/2-[c-f]2=(-1,-2)o(-1,-2)/2-[7/2-5]2=5/2-9/4= .25 .25 m=5Q|M, UU5(QS5TS5={e}{c,d,f}, n1pC2) Corr5,1=.33 Corr5,2=1 Corr5,6=0, Corr5,8=1 Top25={2,8} n1pC2e,c(c-e)o(c-e)/2-[c-e]2=(3,0)o(3,0)/2-[3/2-0]2=9-9/4=6.25 6.25 n1pC2e,d(d-e)o(d-e)/2-[d-e]2= (4,0)o(4,0)/2-[2-0]2=16-4 = 8 8 n1pC2e,f(f-e)o(f-e)/2-[f-e]2=(0,5)o(0,5)/2-[5/2-0]2=25/2-25/4=6.25 6.25 m=6UU6({c,f}{d}, n1pC2) Cr6,1=0 Cr6,2=0 Cr6,5=0 Cr6,8=und 1,2,5 CoSupps = 1 tie break? min rating diff! so Top26={2,5} n1pC2c,d(d-c)o(d-c)/2-[d-c]2= (1,1)o(1,1)/2-[4-3]2=1-1= 0 0 n1pC2f,d(d-f)o(d-f)/2-[d-f]2= (-4,1)o(-4,1)/2-[4-5/2]2=17/2-9/4= 6.25 6.25 problem: due to r=0 taken as rating and getting a 0 1pC2 (unGaussianed). It may be no problem - no rating may correlated with don't like (rated low if it were rated), but in all cases some of the 0s were rated but occur in Q!! m=8Q|M, UU8(QS8TS8={c,e}{d,f}, n1pC2) Corr8,1=1 Corr8,2=und Corr8,5=1 Corr8,6=und Top28={1,5} n1pC2c,d(d-c)o(d-c)/2-[d-c]2= (-3,1)o(-3,1)/2-[5/2-7/2]2=5-1= 4 4 n1pC2c,f(f-c)o(f-c)/2-[f-c]2= (1,2)o(1,2)/2-[5-7/2]2=5/2-9/4= .25 .25 Eliminate them? 
We can remedy this by taking TopKm in CoSuppm,n (or better, choose K depending upon the PearsonCorrm,n of these). n1pC2e,f(f-e)o(f-e)/2-[f-e]2= (4,5)o(4,5)/2-[5-1/2]2=41/2-81/4= .25 .25 Caution: in Toy, most 0's are in Q (78%) whereas in Netflix only .012% of the zeros are in Q. Therefore in Netflix 0 almost always means truly not rated! n1pC2e,d(d-e)o(d-e)/2-[d-e]2= (0,4)o(0,4)/2-[5/2-1/2]2=8-4= 4 4 NOTE: If we systematically remove all "not-rated" (both u and v) we eliminate all the high n1pC2s! I will develop this algorithm on the next slides

  40. Q|M={2,5,6,8} Strategy: Whenever a NoRating=0 drop that movie! and use hi bit exit. U\M 1 2 5 6 8 c 4 3 3 d 1 4 4 5 e 1 f 5 5 5 cQ Q d eQ Q Q fQ Q Tcdef U\M 1 2 5 6 8 U\M 1 2 5 6 8 U\M 1 2 5 6 8 U\M 1 2 5 6 8 Q c e f c 4 3 3 d 1 4 4 5 e 1 f 5 5 5 c 4 3 3 d 1 4 4 5 e 1 f 5 5 5 c 4 3 3 d 1 4 4 5 e 1 f 5 5 5 c 4 3 3 d 1 4 4 5 e 1 f 5 5 5 Tcdef Tcdef Tcdef Q c e f Q c e f Q c e f 0 0 2.25 0 0 0 4 .25 0 0 m=2Q|M, UU2(QS2TS2={e,f}{c,d}, n1pC2) 0=Cor2,1over{c,d} 1=Cor2,5over{{c,d} 0=Cor2,6over{d} Cor2,8=undef Top22={1,5} n1pC2e,c(c-e)o(c-e)/|{1}|-[c-e]2=3-3=0 No computation (1 movie n1pC20) 0 n1pC2f,d(d-f)o(d-f)/2-[d-f]2= (-4,-1)o(-4,-1)/2-[5/2-5]2=17/2-25/4= 2.25 2.25 n1pC2e,d(d-e)o(d-e)/|{1}|-[d-e]2= 0-0=0 (no computation req.!) 0 n1pC2f,c(c-f)o(c-f)/|{1}|-[c-f]2= 0-0=0 (no computation required) 0 m=5Q|M, UU5(QS5TS5={e}{c,d,f}, n1pC2) Corr5,1=.33 Corr5,2=1 Corr5,6=0, Corr5,8=1 Top25={2,8} n1pC2e,c(c-e)o(c-e)/2-[c-e]2No rating signal! n1pC2e,d(d-e)o(d-e)/2-[d-e]2No rating signal! n1pC2e,f(f-e)o(f-e)/2-[f-e]2No rating signal! m=6UU6({c,f}{d}, n1pC2) Cr6,1=0 Cr6,2=0 Cr6,5=0 Cr6,8=und so Top26={2,5} n1pC2c,d(d-c)o(d-c)/2-[d-c]2= (1,1)o(1,1)/2-[4-3]2=1-1= 0 0 n1pC2f,d(d-f)o(d-f)/|{2}|-[d-f]2=(-1)(-1)-(4-5)2nocomp =0 0 m=8Q|M, UU8(QS8TS8={c,e}{d,f}, n1pC2) Corr8,1=1 Corr8,2=und Corr8,5=1 Corr8,6=und Top28={1,5} n1pC2c,d(d-c)o(d-c)/2-[d-c]2= (-3,1)o(-3,1)/2-[5/2-7/2]2=5-1= 4 4 n1pC2c,f(f-c)o(f-c)/2-[f-c]2= (1,2)o(1,2)/2-[5-7/2]2=5/2-9/4= .25 .25 n1pC2e,f(f-e)o(f-e)/|{1}|-[f-e]2 no computation required! = 0 0 next walk thru early exit using hi bit n1pC2e,d(d-e)o(d-e)/|{1}|-[d-e]2 (no computation required!=0 0

  41. U\M 1 2 5 6 8 c 4 3 3 d 1 4 4 5 e 1 f 5 5 5 cQ Q d eQ Q Q fQ Q Tcdef Q c e f Tcdef Tcdef Tcdef Q c e f Q c e f Q c e f mQ|M, need UUm(QSmTSm , n1pC2) in this slide, n1pC2 over Top2m wrt |exact|/CoSup, but in CoSuppu,v If CoSuppu,v doesn't cover any K Supp(Corm,n)s then lower K (if covers 0, use CoSuppu,v). n1pC2u,v(v-u)o(v-u)/2 -[v-u]2 Q|M={2,5,6,8} m=2, UU2({e,f}{c,d}, n1pC2) Cor2,1=0 over{c,d} Cor2,5=1 over {c,d} Cor2,6=0 over {d} Cor2,8=und n1pC2e,c(c-e)o(c-e)/|{1}|-[c-e]2= 0 (3)o(3)/1-[4-1]2=9-9= 0 n1pC2f,d(d-f)o(d-f)/|{1,5}|-[d-f]2= (-4,-1)o(-4,-1)/2-[5/2-5]2=17/2-25/4= 2.25 2.25 n1pC2e,d(d-e)o(d-e)/|{1}|-[d-e]2=(0)o(0)/1-[1-1]2=0-0=0 0 n1pC2f,c(c-f)o(c-f)/|{1,5}|-[c-f]2=(-1,-2)o(-1,-2)/2-[7/2-5]2=5/2-9/4= .25 .25 m=5UU5({e}{c,d,f}, n1pC2) Cor5,1=1.3 {c,d} Cor5,2=1 {c,d} Cor5,6=0 {d} Corr5,8={f} n1pC2e,c(c-e)o(c-e)/|{1}|-[c-e]2 (already computed) = 0 0 n1pC2e,d(d-e)o(d-e)/|{1}|-[d-e]2 (already computed) = 0 0 n1pC2e,f(f-e)o(f-e)/|{1}|-[f-e]2=(4)o(4)/1-[5-1]2 = 16-16 = 0 0 m=6UU6({c,f}{d}, n1pC2) Cr6,1=0 {d} Cr6,2=0 {d} Cr6,5=0 {d} Cr6,8=und n1pC2c,d(d-c)o(d-c)/|125|-[d-c]2= |-3,1,1|2/3-[1/3]2=11/3-1/9= 3.5 3.5 n1pC2f,d(d-f)o(d-f)/|15|-[d-f]2= |-4,-1|2/2-[5-5/2]2=17/2-25/4= 2.25 2.25 m=8UU8({c,e}{d,f}, n1pC2) Corr8,1=1 over {f} Corr8,2=und Corr8,5=1 {f} Corr8,6=und n1pC2c,d(d-c)o(d-c)/|125|-[d-c]2 (already done) = 3.5 3.5 n1pC2c,f(f-c)o(f-c)/|15|-[f-c]2= (1,2)o(1,2)/2-[5-7/2]2=5/2-9/4= .25 .25 n1pC2e,f(f-e)o(f-e)/|{1}|-[f-e]2 (already done) = 0 0 n1pC2e,d(d-e)o(d-e)/|{1}|-[d-e]2 (already done) = 0 0

  42. Heuristics to PruneOff v's that can't give lo 1pC2u,v values for uQm T(U, M,r) c 1 6 110 c 2 5 101 c 5 5 101 d 1 7 111 d 2 6 110 d 5 6 110 d 6 7 111 d 8 6 110 e 1 3 011 f 1 7 111 f 5 7 111 f 8 7 111 3 011 4 100 5 101 6 110 7 111 uQmpruneOff vTmif MidBitPatternu,v changes down CSu,v (CSu,vTopKm?) Note: if MidBit(un,vn) differs for some movie, n, that means the rating pair differs by at least 2 except for rating pairs (3,4) and (5,6). Of course for (3,6) and (3,7) MBs don't differ, (so wouldn't cause a pruning) but 3 is a rare rating. So the main problem is that (5,6)s cause pruning even tho we would not want them to. XOR with 1st  movie in CSu,v if ever a 1 appears, prune that v. c2 1 1 1 0 0 d2 1 1 1 1 1 e2 0 0 0 0 0 f2 1 0 1 0 1 For uQm, Sum down and divide each sum in Suppm by |Sv| to form a new horizontal bitvector XOR it with 1st one. Prune vSuppm if 0. 12 1 1 0 1 221 1 0 0 521 1 0 1 62 0 1 0 0 82 0 1 0 1 c d e f 1 6 7 3 7 2 5 6 Q Q 5 5 6 Q 7 6Q 7 Q 8Q 6 Q 7 c1 1 0 0 0 0 d1 1 1 1 1 1 e1 1 0 0 0 0 f1 1 0 1 0 1 111111 2101 0 0 5101 0 1 61 0 1 0 0 81 0 1 0 1 c0 0 1 1 0 0 d0 1 0 0 1 0 e0 1 0 0 0 0 f0 1 0 1 0 1 10 0 1 1 1 20 1 0 0 0 50 1 0 0 1 60 0 1 0 0 80 0 0 0 1

More Related