Monday, October 26, 2015

Implementation of the K-NN approach, using a suitable example

PROGRAM

filename: b13.py

import csv
from math import sqrt
import random
from collections import Counter
from operator import itemgetter

# Rows of the most recently loaded data file (shuffled).
dataset = []
# Rows used as labelled neighbours by calculate().
training = []
# Rows that calculate() classifies.
test = []


def loaddata(filename='iris.data', train_count=15, test_count=5):
    """Load a CSV data set, shuffle it and split it into training/test rows.

    The module-level ``dataset``, ``training`` and ``test`` lists are
    emptied and refilled in place, so calling this function twice does not
    accumulate rows.

    Args:
        filename: path of the CSV file; every row is ``feature..., label``.
        train_count: number of shuffled rows copied into ``training``.
        test_count: number of the following rows copied into ``test``.

    Raises:
        ValueError: if the file holds fewer than
            ``train_count + test_count`` non-empty rows.
    """
    with open(filename, 'r') as csvfile:
        # csv.reader yields an empty list for a blank line; such rows would
        # break the feature lookup later, so drop them here.
        rows = [row for row in csv.reader(csvfile) if row]

    needed = train_count + test_count
    if len(rows) < needed:
        raise ValueError(
            "%s has %d rows, need at least %d" % (filename, len(rows), needed))

    random.shuffle(rows)
    del dataset[:]
    del training[:]
    del test[:]
    dataset.extend(rows)
    training.extend(rows[:train_count])
    test.extend(rows[train_count:needed])

def calculate(k=3):
    """Classify every row of ``test`` by majority vote of its k nearest rows.

    Each row is ``feature..., label``. The distance between two rows is the
    Euclidean distance over all feature columns (every column except the
    last), parsed as floats so real-valued data is accepted. For each test
    row the predicted label is printed next to the row itself.

    Args:
        k: number of nearest ``training`` rows that take part in the vote.
    """
    for sample in test:
        sample_features = [float(value) for value in sample[:-1]]
        distances = []
        for neighbour in training:
            squared = sum(
                (own - float(other)) ** 2
                for own, other in zip(sample_features, neighbour[:-1]))
            distances.append((sqrt(squared), neighbour))

        # sorted() is stable, so equally distant rows keep training order.
        nearest = sorted(distances, key=itemgetter(0))[:k]
        votes = Counter(row[-1] for _, row in nearest)
        label = max(votes, key=lambda cls: votes[cls])
        print("Class for" + str(sample) + ':' + label)

def main():
    """Load and split the data set, then classify the held-out rows."""
    loaddata()
    calculate()


# Guarded so that importing this module does not trigger file I/O.
if __name__ == '__main__':
    main()

filename: iris.data

10,A
20,A
30,A
40,A
50,A
60,B
70,B
80,B
90,B
100,B
110,C
120,C
130,C
140,C
150,C
22,A
62,B
92,B
11,A
124,C

OUTPUT

Amols-MacBook-Air:b13 Darwin$ python B13.py
Class for['40', 'A']:A
Class for['60', 'B']:B
Class for['92', 'B']:B
Class for['124', 'C']:C
Class for['90', 'B']:B

Implement Naïve Bayes for a concurrent/distributed application. The approach should handle categorical and continuous data.

PROGRAM

filename: b12.py

from __future__ import division
import math,os
from multiprocessing import Process,Queue
#probability for categorical attribute



def info(title):
    """Print *title*, the module name and the ids of this process.

    The parent process id is reported only when ``os.getppid`` exists.
    """
    print(title)
    report = ['module name: %s' % __name__]
    if hasattr(os, 'getppid'):  # only available on Unix
        report.append('parent process: %s' % os.getppid())
    report.append('process id: %s' % os.getpid())
    for line in report:
        print(line)

#print gaussian_dis(30,18,6)
#print categorical_prob(20,[12,10,10,13,14,32,43],len([12,20,10,13,14,32,43]))

def readfile(filename):
    """Parse the whitespace-separated training file into per-column lists.

    Each non-blank line must hold at least five fields, in the order
    age, income, student, credit_rating, buys_computer. Blank lines are
    skipped and any run of whitespace separates fields.

    Args:
        filename: path of the data file.

    Returns:
        dict mapping each column name to the list of its values (strings),
        in file order.

    Raises:
        ValueError: if a non-blank line has fewer than five fields.
    """
    columns = ('age', 'income', 'student', 'credit_rating', 'buys_computer')
    dictionary = dict((name, []) for name in columns)
    with open(filename, "r") as f:
        for number, line in enumerate(f, 1):
            fields = line.split()
            if not fields:
                continue
            if len(fields) < len(columns):
                raise ValueError(
                    "%s line %d: expected %d fields, got %d"
                    % (filename, number, len(columns), len(fields)))
            for name, value in zip(columns, fields):
                dictionary[name].append(value)
    return dictionary



def getIncome(q):
    """Put the income-level probabilities per class on queue *q*.

    Reads the module-level ``dicti`` and ``tnor``. An income value other
    than 'high' or 'low' is counted as medium, and a buys_computer value
    other than 'yes' is counted as no. The list put on *q* is ordered
    [high|yes, high|no, low|yes, low|no, medium|yes, medium|no].
    """
    #income
    info('function getIncome')
    levels = ('high', 'low', 'medium')
    answers = ('yes', 'no')
    tally = dict(((level, answer), 0) for level in levels for answer in answers)

    for row in xrange(tnor):
        level = dicti['income'][row]
        if level != 'high' and level != 'low':
            level = 'medium'
        answer = 'yes' if dicti['buys_computer'][row] == 'yes' else 'no'
        tally[(level, answer)] += 1

    # Number of rows per class, used as the denominators.
    totals = {
        'yes': sum(tally[(level, 'yes')] for level in levels),
        'no': sum(tally[(level, 'no')] for level in levels),
    }
    q.put([tally[(level, answer)] / totals[answer]
           for level in levels for answer in answers])


def getCreditRating(q):
    """Put the credit-rating probabilities per class on queue *q*.

    Reads the module-level ``dicti`` and ``tnor``. A rating other than
    'excellent' is counted as fair, and a buys_computer value other than
    'yes' is counted as no. The list put on *q* is ordered
    [fair|yes, fair|no, excellent|yes, excellent|no].
    """
    counts = {
        'fair': {'yes': 0, 'no': 0},
        'excellent': {'yes': 0, 'no': 0},
    }
    for row in xrange(tnor):
        if dicti['credit_rating'][row] == 'excellent':
            rating = 'excellent'
        else:
            rating = 'fair'
        bought = 'yes' if dicti['buys_computer'][row] == 'yes' else 'no'
        counts[rating][bought] += 1

    # Number of rows per class, used as the denominators.
    len_yes = counts['excellent']['yes'] + counts['fair']['yes']
    len_no = counts['excellent']['no'] + counts['fair']['no']
    q.put([
        counts['fair']['yes'] / len_yes,
        counts['fair']['no'] / len_no,
        counts['excellent']['yes'] / len_yes,
        counts['excellent']['no'] / len_no,
    ])


def getStudentStatus(q):
    """Put the student-status probabilities per class on queue *q*.

    Reads the module-level ``dicti`` and ``tnor``. A student value other
    than 'no' is counted as a student, and a buys_computer value other
    than 'yes' is counted as no. The list put on *q* is ordered
    [student|yes, student|no, non-student|yes, non-student|no].
    """
    #student
    info('function getStudentStatus')
    counts = {
        'student': {'yes': 0, 'no': 0},
        'non_student': {'yes': 0, 'no': 0},
    }
    for row in xrange(tnor):
        if dicti['student'][row] == 'no':
            status = 'non_student'
        else:
            status = 'student'
        bought = 'yes' if dicti['buys_computer'][row] == 'yes' else 'no'
        counts[status][bought] += 1

    # Number of rows per class, used as the denominators.
    len_yes = counts['non_student']['yes'] + counts['student']['yes']
    len_no = counts['non_student']['no'] + counts['student']['no']
    q.put([
        counts['student']['yes'] / len_yes,
        counts['student']['no'] / len_no,
        counts['non_student']['yes'] / len_yes,
        counts['non_student']['no'] / len_no,
    ])

if __name__ == '__main__':
    # dicti and tnor are module-level names: the worker functions read them
    # directly, which relies on the child processes inheriting this
    # module's state.
    dicti = readfile("data_nb.txt")
    print(dicti)

    #Total number of rows in the record
    tnor = len(dicti['buys_computer'])
    print(tnor)

    # Class priors. The "no" prior is derived from the *count* of "yes"
    # rows, not from the "yes" probability.
    aprior_num_yes = dicti['buys_computer'].count('yes')
    aprior_yes = aprior_num_yes / tnor
    aprior_no = (tnor - aprior_num_yes) / tnor

    # NOTE(review): the priors above are computed but the scores below are
    # still products of the conditional probabilities only, so the printed
    # values are unchanged. Multiplying by aprior_yes / aprior_no would
    # give the full naive Bayes score - confirm before changing the output.
    yes_total_prob = 1
    no_total_prob = 1

    info('main line')
    # One queue per worker: all workers can run at the same time and each
    # result is still read from a known place.
    workers = [getIncome, getStudentStatus, getCreditRating]
    queues = [Queue() for _ in workers]
    p = [Process(target=worker, args=(queue,))
         for worker, queue in zip(workers, queues)]
    for x in p:
        x.start()
    # Read every result before joining; a process that has put data on a
    # queue may not exit until that data has been consumed.
    process_output = [queue.get() for queue in queues]
    for x in p:
        x.join()

    # Unpack in exactly the order each worker builds its list.
    (income_high_yes_prob, income_high_no_prob,
     income_low_yes_prob, income_low_no_prob,
     income_medium_yes_prob, income_medium_no_prob) = process_output[0]

    (studyes_yes_prob, studyes_no_prob,
     studno_yes_prob, studno_no_prob) = process_output[1]

    (credit_fair_yes_prob, credit_fair_no_prob,
     credit_exec_yes_prob, credit_exec_no_prob) = process_output[2]

    income_x = raw_input("Enter the income:\n>")
    credit_x = raw_input("Enter the credit rating:\n>")
    stud_x = raw_input("Enter the student status:\n>")

    # Any income other than 'high'/'medium' is treated as low.
    if income_x == 'high':
        yes_total_prob *= income_high_yes_prob
        no_total_prob *= income_high_no_prob
    elif income_x == 'medium':
        yes_total_prob *= income_medium_yes_prob
        no_total_prob *= income_medium_no_prob
    else:
        yes_total_prob *= income_low_yes_prob
        no_total_prob *= income_low_no_prob

    # Any rating other than 'fair' is treated as excellent.
    if credit_x == 'fair':
        yes_total_prob *= credit_fair_yes_prob
        no_total_prob *= credit_fair_no_prob
    else:
        yes_total_prob *= credit_exec_yes_prob
        no_total_prob *= credit_exec_no_prob

    # Any status other than 'yes' is treated as non-student.
    if stud_x == 'yes':
        yes_total_prob *= studyes_yes_prob
        no_total_prob *= studyes_no_prob
    else:
        yes_total_prob *= studno_yes_prob
        no_total_prob *= studno_no_prob

    print("No Probablility: " + str(no_total_prob))
    print("Yes Probability: " + str(yes_total_prob))
    if no_total_prob > yes_total_prob:
        print("NO")
    else:
        print("YES")
        

filename: data_nb.txt

10 high no fair no
12 high no excellent no
32 high no fair yes
64 medium no fair yes
65 low yes fair yes
60 low yes excellent no
37 low yes excellent yes
18 medium no fair no
16 low yes fair yes
69 medium yes fair yes
15 medium yes excellent yes
36 medium no excellent yes
31 high yes fair yes
70 medium no excellent no

OUTPUT

Amols-MacBook-Air:b12 Darwin$ python b12.py
{'credit_rating': ['fair', 'excellent', 'fair', 'fair', 'fair', 'excellent', 'excellent', 'fair', 'fair', 'fair', 'excellent', 'excellent', 'fair', 'excellent'], 'buys_computer': ['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no'], 'age': ['10', '12', '32', '64', '65', '60', '37', '18', '16', '69', '15', '36', '31', '70'], 'student': ['no', 'no', 'no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no'], 'income': ['high', 'high', 'high', 'medium', 'low', 'low', 'low', 'medium', 'low', 'medium', 'medium', 'medium', 'high', 'medium']}
14
main line
module name: __main__
parent process: 4406
process id: 4594
function getIncome
module name: __main__
parent process: 4594
process id: 4595
function getStudentStatus
module name: __main__
parent process: 4594
process id: 4596
Enter the income:
>high
Enter the credit rating:
>fair
Enter the student status:
>no
No Probablility: 0.128
Yes Probability: 0.0493827160494
NO

Implement the Apriori approach for data mining to organize the data items on a shelf, using the dataset.csv file (given below) of items purchased in a mall.



PROGRAM

filename: apriori.py

from collections import defaultdict

def readFile(fn):
    """Yield each transaction of the CSV file *fn* as a frozenset of items.

    Whitespace around an item is removed, and empty items (from a trailing
    or doubled comma) are dropped. Lines left with no items are skipped, so
    a blank line does not count as a transaction. The file is closed once
    the generator is exhausted or closed.

    Args:
        fn: path of a file with one comma-separated transaction per line.
    """
    with open(fn, "r") as f:
        for line in f:
            items = [item.strip() for item in line.split(',')]
            items = [item for item in items if item]
            if items:
                yield frozenset(items)

def returnItemSetTL(data_iter):
    """Return the 1-item sets and the transaction list for *data_iter*.

    Returns:
        tuple ``(item_sets, transactions)``: ``transactions`` is the list of
        records converted to frozensets, in input order, and ``item_sets``
        is a set holding one single-element frozenset per distinct item.
    """
    TransactionList = [frozenset(record) for record in data_iter]
    _itemSet = set(
        frozenset([item]) for trans in TransactionList for item in trans)
    return _itemSet, TransactionList

def returnItemWithMinSupport(itemSet, TransactionList, ms, freqSet):
    """Return the members of *itemSet* whose support is at least *ms*.

    Args:
        itemSet: iterable of candidate frozensets.
        TransactionList: list of transactions (frozensets).
        ms: minimum support, as a fraction of the transaction count.
        freqSet: mapping that accumulates occurrence counts. It is updated
            in place for every candidate occurring at least once, and must
            supply 0 for missing keys (e.g. ``defaultdict(int)``).

    Returns:
        set of the candidates that occur in at least one transaction and
        whose support reaches *ms*.
    """
    total = float(len(TransactionList))
    _itemSet = set()
    for item in itemSet:
        count = sum(1 for trans in TransactionList if item.issubset(trans))
        if not count:
            # Candidates that never occur are neither counted nor kept.
            continue
        freqSet[item] += count
        # Compare against the threshold passed in, not a module global.
        if count / total >= ms:
            _itemSet.add(item)
    return _itemSet

def joinSet(itemSet, length):
    """Return every pairwise union of *itemSet* members with *length* items.

    Each member is also paired with itself, so members that already have
    *length* elements are part of the result.
    """
    joined = set()
    for left in itemSet:
        for right in itemSet:
            candidate = left.union(right)
            if len(candidate) == length:
                joined.add(candidate)
    return joined

def getSupport(freqSet, item, TransactionList):
    """Return ``freqSet[item]`` as a fraction of the transaction count."""
    occurrences = float(freqSet[item])
    transaction_count = float(len(TransactionList))
    return occurrences / transaction_count

def runApriori(data_iter, ms, mc):
    """Run the level-wise Apriori search and print the results.

    Prints the frequent 1-item sets under "L1" and, for every later level
    k, the joined candidate sets under "L k" before they are filtered by
    support. Then prints all frequent item sets with their support, and
    finally each item set of the greatest size found, in ascending order of
    support.

    Args:
        data_iter: iterable of transactions.
        ms: minimum support, passed to returnItemWithMinSupport.
        mc: minimum confidence; accepted but not used by this function.
    """
    itemSet, TransactionList = returnItemSetTL(data_iter)

    freqSet = defaultdict(int)
    largeSet = {}
    currentLSet = returnItemWithMinSupport(itemSet, TransactionList, ms, freqSet)
    print("L1")
    print(currentLSet)

    k = 2
    while currentLSet != set([]):
        largeSet[k - 1] = currentLSet
        candidates = joinSet(currentLSet, k)
        print("L %d" % k)
        print(candidates)
        currentLSet = returnItemWithMinSupport(candidates, TransactionList, ms, freqSet)
        k += 1

    items = []
    for value in largeSet.values():
        for item in value:
            items.append((tuple(item), getSupport(freqSet, item, TransactionList)))
    print(items)

    by_support = sorted(items, key=lambda pair: pair[1])
    max_value = 0
    for item, support in by_support:
        max_value = max(max_value, len(item))
    for item, support in by_support:
        if len(item) == max_value:
            print(item)


if __name__ == '__main__':
    # Thresholds for the run; both are module-level names.
    minSupport = 0.6
    minConfidence = 1
    # readFile is a generator, so the file is only read once runApriori
    # starts consuming it.
    inFile = readFile("dataset.csv")
    runApriori(inFile, minSupport, minConfidence)


filename: dataset.csv

Mango,Onion,Jar,Keychain,Eggs,Chocolate
Nuts,Onion,Jar,Keychain,Eggs,Chocolate
Mango,Apple,Keychain,Eggs
Mango,Toothbrush,Corn,Keychain,Chocolate
Corn,Onion,Onion,Keychain,Knife,Eggs

OUTPUT

Amols-MacBook-Air:b10 Darwin$ python apriori.py
L1
set([frozenset(['Onion']), frozenset(['Chocolate']), frozenset(['Keychain']), frozenset(['Eggs']), frozenset(['Mango'])])
L 2
set([frozenset(['Mango', 'Keychain']), frozenset(['Eggs', 'Onion']), frozenset(['Onion', 'Chocolate']), frozenset(['Mango', 'Chocolate']), frozenset(['Keychain', 'Chocolate']), frozenset(['Eggs', 'Chocolate']), frozenset(['Keychain', 'Onion']), frozenset(['Mango', 'Onion']), frozenset(['Eggs', 'Keychain']), frozenset(['Eggs', 'Mango'])])
L 3
set([frozenset(['Eggs', 'Keychain', 'Onion']), frozenset(['Eggs', 'Keychain', 'Chocolate']), frozenset(['Mango', 'Keychain', 'Chocolate']), frozenset(['Mango', 'Keychain', 'Onion']), frozenset(['Eggs', 'Mango', 'Keychain']), frozenset(['Keychain', 'Onion', 'Chocolate'])])
L 4
set([])
[(('Onion',), 0.6), (('Chocolate',), 0.6), (('Keychain',), 1.0), (('Eggs',), 0.8), (('Mango',), 0.6), (('Eggs', 'Keychain'), 0.8), (('Mango', 'Keychain'), 0.6), (('Eggs', 'Onion'), 0.6), (('Keychain', 'Onion'), 0.6), (('Keychain', 'Chocolate'), 0.6), (('Eggs', 'Keychain', 'Onion'), 0.6)]
('Eggs', 'Keychain', 'Onion')

Using any similarity-based technique, develop an application to classify text data. Perform pre-processing tasks as required.

PROGRAM

filename: b9.py

from math import sqrt
from math import log
from collections import Counter
from operator import itemgetter

def tf(kt, doc):
    """Return the raw number of times *kt* occurs in *doc*.

    The count is whatever ``doc.count(kt)`` reports; no normalisation by
    document length is applied.
    """
    occurrences = doc.count(kt)
    return occurrences

def idf(kt, all_docs):
    """Return the inverse document frequency of *kt*, rounded to 3 places.

    Computes ``log(len(all_docs) / n)`` where n is the number of documents
    for which ``kt in document`` is true. Returns 0 when no document
    contains *kt*.
    """
    containing = sum(1 for document in all_docs if kt in document)
    if containing <= 0:
        return 0
    ratio = float(len(all_docs)) / float(containing)
    return round(float(log(ratio)), 3)

def tfidf(kt, doc):
    """Return tf(kt, doc) weighted by idf over the module-level all_docs."""
    term_frequency = tf(kt, doc)
    inverse_document_frequency = idf(kt, all_docs)
    return term_frequency * inverse_document_frequency

def cos_sim(infile, docs, ktrms):
    """Return the cosine similarity of two texts over the terms *ktrms*.

    Both texts are represented by their tf-idf weights for each term. The
    result is rounded to 3 places, and is 0 when either vector has zero
    length.
    """
    dot_product = sum(tfidf(term, infile) * tfidf(term, docs) for term in ktrms)
    norm_product = doclen(infile, ktrms) * doclen(docs, ktrms)
    if norm_product:
        return round(dot_product / norm_product, 3)
    return 0

def doclen(doc, ktrms):
    """Return the Euclidean length of *doc*'s tf-idf vector over *ktrms*."""
    squared_weights = [pow(tfidf(term, doc), 2) for term in ktrms]
    return sqrt(sum(squared_weights))

# Raw text of each training document, in the order of `documents`.
files = []
# Lower-cased document texts; tfidf() reads this list as its corpus.
all_docs = []
# Distinct whitespace-separated terms found in the corpus.
key_terms = []

documents = ['doc1.txt', 'doc2.txt', 'doc3.txt', 'doc4.txt', 'doc5.txt', 'doc6.txt']
# One [document, category] entry per training document; the similarity to
# the test file is appended to each entry further down.
result = [['doc1.txt', 'animals'], ['doc2.txt', 'animals'], ['doc3.txt', 'animals'],
          ['doc4.txt', 'sports'], ['doc5.txt', 'sports'], ['doc6.txt', 'sports']]

for x in documents:
    # Close every handle as soon as its content has been read.
    with open(x, 'r') as handle:
        files.append(handle.read())

for x in files:
    all_docs.append(x.lower().rstrip('\n'))

for x in all_docs:
    key_terms.extend(x.split())
key_terms = list(set(key_terms))

filename = raw_input("Enter test file: ")
# Only the first line of the test file is classified.
with open(filename, 'r') as handle:
    inputfile = handle.readline().lower()

for cnt, x in enumerate(all_docs):
    result[cnt] = result[cnt] + [cos_sim(inputfile, x, key_terms)]
print(result)

# Vote among the k most similar documents, ignoring zero similarity.
k = 3
sortedresult = sorted(result, key=itemgetter(2), reverse=True)
top_k = [x for x in sortedresult[:k] if x[2] != 0]
if len(top_k) == 0:
    print("Does not match")
else:
    class_count = Counter(category for (document, category, value) in top_k)
    print(class_count)
    classification = max(class_count, key=lambda cls: class_count[cls])
    print("Class of test file:  " + classification)

'''
TF: Term Frequency, which measures how frequently a term occurs in a document. Since documents differ in length, a term may appear many more times in a long document than in a short one. The term frequency is therefore often divided by the document length (i.e. the total number of terms in the document) as a way of normalization:
TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document).

IDF: Inverse Document Frequency, which measures how important a term is. When computing TF, all terms are considered equally important. However, certain terms such as "is", "of", and "that" may appear many times but have little importance. We therefore weigh down the frequent terms while scaling up the rare ones, by computing the following:
IDF(t) = log_e(Total number of documents / Number of documents with term t in it).
'''

filename: doc1.txt
Animals live on land and water.Land animals include cats, cows.Water animals include all types of fishes.

filename: doc2.txt
Animals can be classified as herbivorous which is plant eating, carnivorous which is flesh eating and omnivorous which is eating both.

filename: doc3.txt
All land animals have two eyes and ears.They have one tail.

filename: doc4.txt
Sports are all forms of usually competitive activity which,through casual or organised participation, aim to use, maintain or improve physical ability and skills.

filename: doc5.txt
Different sport have different rules.

filename: doc6.txt
Depending on the sport, every sport has different number of players in each team.

filename: t.txt
Dog is an omnivorous creature that lives on land and has one tail.

filename: t2.txt
Football is a competitive sport with eleven players.

OUTPUT

Amols-Air:b9 Darwin$ python b9.py
Enter test file: t.txt
[['doc1.txt', 'animals', 0.249], ['doc2.txt', 'animals', 0.162], ['doc3.txt', 'animals', 0.388], ['doc4.txt', 'sports', 0.067], ['doc5.txt', 'sports', 0.011], ['doc6.txt', 'sports', 0.164]]
Counter({'animals': 2, 'sports': 1})
Class of test file:  animals

Implementing a recursive descent parser for a sample language

PROGRAM

filename: b7.py

"""
exp ::= term | exp + term | exp - term
term ::= factor | factor * term | factor / term
factor ::= number | ( exp )
"""
class Calculator():
    """Recursive-descent evaluator for a list of arithmetic tokens.

    Grammar (operators of equal precedence associate to the left):

        exp    ::= term   { ('+' | '-') term }
        term   ::= factor { ('*' | '/') factor }
        factor ::= number | '(' exp ')'

    Tokens are strings: numbers such as '12' or '1.5', the operators
    '+', '-', '*', '/', and the parentheses '(' and ')'.
    """

    def __init__(self, tokens):
        self._tokens = tokens
        # None marks the end of input; an empty token list starts there.
        self._current = tokens[0] if tokens else None

    def exp(self):
        """Evaluate a sum/difference of terms and return it as a float.

        Raises:
            ValueError: if the tokens do not form a valid expression.
        """
        result = self.term()
        while self._current in ('+', '-'):
            operator = self._current
            self.next()
            operand = self.term()
            if operator == '+':
                result += operand
            else:
                result -= operand
        return result

    def factor(self):
        """Evaluate a number or a parenthesised expression.

        Raises:
            ValueError: on end of input, an unexpected token, or a missing
                closing parenthesis.
        """
        token = self._current
        if token is None:
            raise ValueError("unexpected end of expression")
        # The last-character test also accepts numbers written like '.5'.
        if token[0].isdigit() or token[-1].isdigit():
            result = float(token)
            self.next()
            return result
        if token == '(':
            self.next()
            result = self.exp()
            if self._current != ')':
                raise ValueError("expected ')'")
            self.next()
            return result
        raise ValueError("unexpected token %r" % (token,))

    def next(self):
        """Advance to the following token (None once input is exhausted)."""
        self._tokens = self._tokens[1:]
        self._current = self._tokens[0] if self._tokens else None

    def term(self):
        """Evaluate a product/quotient of factors and return it as a float."""
        result = self.factor()
        while self._current in ('*', '/'):
            operator = self._current
            self.next()
            # Take a single factor, not a whole term: recursing into term()
            # here would evaluate 8/4/2 as 8/(4/2).
            operand = self.factor()
            if operator == '*':
                result *= operand
            else:
                result /= operand
        return result

def tokenize_expression(expression):
    """Split an arithmetic expression into number, operator and paren tokens.

    Spaces are ignored. Consecutive digits, optionally containing a
    decimal point, are merged into one number token; every other character
    becomes a token of its own.

    Args:
        expression: the text typed by the user.

    Returns:
        list of token strings (empty for blank input).

    Raises:
        ValueError: if the expression ends with an operator.
    """
    chars = expression.replace(' ', '')
    if chars and chars[-1] in ('*', '/', '-', '+'):
        raise ValueError("Syntax Error String not accepted")

    tokens = []
    for ch in chars:
        previous = tokens[-1] if tokens else ''
        # A token ending in a digit or '.' is a number still being built,
        # which keeps multi-digit fractions such as "1.52" in one token.
        in_number = previous != '' and (previous[-1].isdigit() or previous[-1] == '.')
        if ch.isdigit() and in_number:
            tokens[-1] += ch
        elif ch == '.' and previous.isdigit():
            tokens[-1] += ch
        else:
            tokens.append(ch)
    return tokens


if __name__ == '__main__':
    while True:
        # A syntax error only affects the line it occurred on; later
        # inputs are evaluated normally.
        try:
            tokens = tokenize_expression(raw_input('> '))
        except ValueError as error:
            print(error)
            continue
        if not tokens:
            continue
        print(Calculator(tokens).exp())

OUTPUT

Amols-MacBook-Air:Codes Darwin$ python b7.py
> 5+8*9 
77.0
> 4/4
1.0
> 4+2+
Syntax Error String not accepted
> ^CTraceback (most recent call last):
  File "b7.py", line 47, in <module>
    lst = list(raw_input('> ').replace(' ', ''))
KeyboardInterrupt

Code optimization using DAG

PROGRAM

filename: b4.py

from sys import argv

# Maps an identifier (operand or assignment target) to its current DAG node;
# filled by read_input().
mp = {}
# Assignment targets in the order read_input() encountered them.
list_statement = []
# Maps each identifier to the name print_code() emits in its place;
# optimise() points duplicated statements at the earlier equivalent one.
replace = {}
# Assignment targets that optimise() found to repeat an earlier statement;
# print_code() skips them.
not_used = []

class ExternalNode():
    """DAG node without children: an operand known only by its name."""
    # Class-level default; every instance overwrites it in __init__.
    variable_identifier = ''
    def __init__(self, identifier):
        # identifier: name of the operand this node stands for.
        self.variable_identifier = identifier

class InternalNode():
    """DAG node for one statement: an operator applied to two child nodes."""
    # Class-level defaults; every instance overwrites them in __init__.
    variable_identifier = ''
    operator_value = ''
    lchild = None
    rchild = None
    def __init__(self, val, l, r, id):
        # val: operator symbol; l / r: left and right operand nodes;
        # id: name of the variable the statement assigns to.
        self.operator_value = val
        self.lchild = l
        self.rchild = r
        self.variable_identifier = id

def read_input(file_name):
    """Read three-address statements from *file_name* into the DAG tables.

    Each statement has the form ``target := first operator second``.
    Reading stops at the first line that starts with "return"; blank lines
    are skipped. For every statement an InternalNode is stored in ``mp``
    under the target name, the target is appended to ``list_statement``,
    and ``replace`` maps the target to itself.

    When the operator is '=', the first operand always gets a fresh
    ExternalNode, so statements that use it afterwards no longer compare
    equal to earlier ones.

    Args:
        file_name: path of the intermediate-code file.

    Raises:
        ValueError: if a statement lacks ":=" or has fewer than three
            fields on its right-hand side.
    """
    with open(file_name, 'r') as source:
        lines = source.read().strip().split("\n")

    for line in lines:
        if line.startswith("return"):
            break
        if not line.strip():
            continue

        pieces = line.split(":=")
        if len(pieces) < 2:
            raise ValueError("missing ':=' in statement: %r" % (line,))
        value = pieces[0].strip()
        fields = pieces[1].split()
        if len(fields) < 3:
            raise ValueError("expected 'a op b' in statement: %r" % (line,))
        first, operator, second = fields[0], fields[1], fields[2]

        if operator == '=' or first not in mp:
            mp[first] = ExternalNode(first)
            replace[first] = first
        if second not in mp:
            mp[second] = ExternalNode(second)
            replace[second] = second

        list_statement.append(value)
        mp[value] = InternalNode(operator, mp[first], mp[second], value)
        replace[value] = value

def print_code():
    """Print every statement that is not listed in ``not_used``.

    Operands are printed under the names recorded in ``replace``.
    """
    for st in list_statement:
        if st in not_used:
            continue
        node = mp[st]
        left = replace[node.lchild.variable_identifier]
        right = replace[node.rchild.variable_identifier]
        print(node.variable_identifier + " = " + left + " " + node.operator_value + " " + right)

def optimise():
    """Mark statements that repeat an earlier statement's computation.

    Two statements match when they share the same operator and the same
    left and right child nodes. The later one is mapped in ``replace`` to
    whatever the earlier one maps to, and its name is appended to
    ``not_used``.
    """
    for position, st in enumerate(list_statement):
        node = mp[st]
        # The slice starts at the statement itself, as the comparison with
        # its own node is a harmless self-assignment in ``replace``.
        for st2 in list_statement[position:]:
            other = mp[st2]
            same_operands = node.lchild == other.lchild and node.rchild == other.rchild
            if not (same_operands and node.operator_value == other.operator_value):
                continue
            replace[st2] = replace[st]
            if st2 != st:
                not_used.append(st2)

if __name__ == "__main__":
    # A single command-line argument overrides the default input file.
    file_name = argv[1] if len(argv) == 2 else "intermediate_code"
    read_input(file_name)
    optimise()
    print_code()

filename: intermediate_code

t1 := a + b
t2 := c * t1
t3 := a + b
t4 := d * t3
t5 := t2 + t4
t6 := r = t5
t7 := t3 + a
t0 := a = t4
t8 := a + b
t9 := a + b
t10 := t9 + a
t11 := a = l
t12 := a + b
t13 := a + b
t14 := j + t13


OUTPUT

Amols-Air:b4 Darwin$ python b4.py
t1 = a + b
t2 = c * t1
t4 = d * t1
t5 = t2 + t4
t6 = r = t5
t7 = t1 + a
t0 = a = t4
t8 = a + b
t10 = t8 + a
t11 = a = l
t12 = a + b
t14 = j + t12

Perform a suitable assignment using Xen Hypervisor or equivalent open source to configure it. Give necessary GUI.

 To install kvm on Fedora:  yum install kvm  yum install virt-manager libvirt libvirt-python python-virtinst  su -c "yum install @v...