{iCode}: October 2015

Monday, October 26, 2015

Implementation of the K-NN approach, using a suitable example

PROGRAM

filename: b13.py

import csv
from math import sqrt
import random
from collections import Counter
from operator import itemgetter

# Module-level row containers for the K-NN script.
dataset = list()    # declared here; loaddata() binds its own local of the same name
training = list()   # rows appended by loaddata(), searched by calculate()
test = list()       # rows appended by loaddata(), classified by calculate()

def loaddata(filename='iris.data', train_count=15, test_count=5):
    """Shuffle the rows of a CSV file and split them into training/test sets.

    The first ``train_count`` shuffled rows go to the module-level
    ``training`` list and the next ``test_count`` rows go to ``test``.  The
    full shuffled list is stored in the module-level ``dataset``.

    Args:
        filename: path of the CSV file (defaults to the original 'iris.data').
        train_count: number of rows used as training data (default 15).
        test_count: number of rows to classify (default 5).

    A file with fewer rows than requested yields shorter lists instead of
    raising IndexError.  Calling the function again replaces the previous
    split rather than appending to it.
    """
    global dataset
    # 'rb' is the mode the Python 2 csv module expects.
    with open(filename, 'rb') as csvfile:
        # csv.reader returns [] for a blank line; such rows carry no data.
        rows = [row for row in csv.reader(csvfile) if row]
    random.shuffle(rows)
    dataset = rows

    # Reset the shared lists in place so other references stay valid.
    del training[:]
    del test[:]
    training.extend(rows[:train_count])
    test.extend(rows[train_count:train_count + test_count])

def calculate(samples=None, neighbours=None, k=3):
    """Classify each sample by majority vote among its k nearest neighbours.

    Every row is a sequence whose last element is the class label and whose
    preceding elements are numeric features (given as numbers or numeric
    strings).  Distance is Euclidean over all feature columns.

    Args:
        samples: rows to classify; defaults to the module-level ``test``.
        neighbours: labelled rows to search; defaults to the module-level
            ``training``.
        k: number of nearest neighbours that vote (default 3).

    Returns:
        The predicted labels, one per sample, in input order.  Each
        prediction is also printed as ``Class for<row>:<label>``.

    Raises:
        ValueError: if there are no labelled rows to search.
    """
    if samples is None:
        samples = test
    if neighbours is None:
        neighbours = training
    if not neighbours:
        raise ValueError("calculate() needs at least one training row")

    predictions = []
    for x in samples:
        distances = []
        for y in neighbours:
            # float() accepts both '40' and '5.1'; the label column is excluded.
            squared = sum(pow(float(a) - float(b), 2)
                          for a, b in zip(x[:-1], y[:-1]))
            distances.append((sqrt(squared), y))
        nearest = sorted(distances, key=itemgetter(0))[:k]
        classes = Counter(row[-1] for _, row in nearest)
        label = max(classes, key=lambda cls: classes[cls])
        print("Class for" + str(x) + ':' + label)
        predictions.append(label)
    return predictions

def main():
    """Load and split the data set, then classify every test row."""
    loaddata()
    calculate()


# Run only when executed as a script, so importing the module does not
# read the data file or print anything.
if __name__ == '__main__':
    main()

filename: iris.data

10,A
20,A
30,A
40,A
50,A
60,B
70,B
80,B
90,B
100,B
110,C
120,C
130,C
140,C
150,C
22,A
62,B
92,B
11,A
124,C

OUTPUT

Amols-MacBook-Air:b13 Darwin$ python B13.py
Class for['40', 'A']:A
Class for['60', 'B']:B
Class for['92', 'B']:B
Class for['124', 'C']:C
Class for['90', 'B']:B

Implement Naïve Bayes for a concurrent/distributed application. The approach should handle categorical and continuous data.

PROGRAM

filename: b12.py

from __future__ import division
import math,os
from multiprocessing import Process,Queue
#probability for categorical attribute

def info(title):
    """Print *title*, the module name and the ids of the running process."""
    print(title)
    print('module name: %s' % (__name__,))
    parent_lookup_available = hasattr(os, 'getppid')  # only available on Unix
    if parent_lookup_available:
        print('parent process: %s' % (os.getppid(),))
    print('process id: %s' % (os.getpid(),))

#print gaussian_dis(30,18,6)
#print categorical_prob(20,[12,10,10,13,14,32,43],len([12,20,10,13,14,32,43]))

def readfile(filename):
    """Read a whitespace-separated record file into a dict of columns.

    Each non-blank line must hold at least five fields, in the order
    age, income, student, credit_rating, buys_computer.

    Args:
        filename: path of the text file to read.

    Returns:
        A dict mapping each column name to the list of its values (strings),
        in file order.

    Raises:
        IndexError: if a non-blank line has fewer than five fields.
    """
    columns = ('age', 'income', 'student', 'credit_rating', 'buys_computer')
    dictionary = dict((name, []) for name in columns)
    # The with-block closes the file even if a malformed line raises.
    with open(filename, "r") as f:
        for line in f:
            # split() with no argument tolerates tabs, repeated spaces and
            # the trailing newline.
            x = line.split()
            if not x:
                continue  # blank line
            for position, name in enumerate(columns):
                dictionary[name].append(x[position])
    return dictionary

def getIncome(q):
    """Put the conditional probabilities of each income level on *q*.

    Reads the module-level ``dicti`` (column dict) and ``tnor`` (row count).
    Any income value other than 'high' or 'low' is counted as medium, and
    any ``buys_computer`` value other than 'yes' is counted as no.

    The single list put on *q* is ordered::

        [high|yes, high|no, low|yes, low|no, medium|yes, medium|no]

    A class with no rows contributes probabilities of 0.0 instead of
    raising ZeroDivisionError.
    """
    #income
    info('function getIncome')
    levels = ('high', 'low', 'medium')
    tally = dict(((level, answer), 0)
                 for level in levels for answer in ('yes', 'no'))
    for x in range(tnor):
        level = dicti['income'][x]
        if level not in ('high', 'low'):
            level = 'medium'
        answer = 'yes' if dicti['buys_computer'][x] == 'yes' else 'no'
        tally[(level, answer)] += 1

    len_yes = sum(tally[(level, 'yes')] for level in levels)
    len_no = sum(tally[(level, 'no')] for level in levels)
    # When a class is empty all its counts are 0, so dividing by 1 gives 0.0.
    yes_total = float(len_yes or 1)
    no_total = float(len_no or 1)

    probabilities = []
    for level in levels:
        probabilities.append(tally[(level, 'yes')] / yes_total)
        probabilities.append(tally[(level, 'no')] / no_total)
    q.put(probabilities)

def getCreditRating(q):
    """Put the conditional probabilities of each credit rating on *q*.

    Reads the module-level ``dicti`` (column dict) and ``tnor`` (row count).
    Any rating other than 'excellent' is counted as fair, and any
    ``buys_computer`` value other than 'yes' is counted as no.

    The single list put on *q* is ordered::

        [fair|yes, fair|no, excellent|yes, excellent|no]

    A class with no rows contributes probabilities of 0.0 instead of
    raising ZeroDivisionError.
    """
    ratings = ('fair', 'excellent')
    tally = dict(((rating, answer), 0)
                 for rating in ratings for answer in ('yes', 'no'))
    for x in range(tnor):
        rating = 'excellent' if dicti['credit_rating'][x] == 'excellent' else 'fair'
        answer = 'yes' if dicti['buys_computer'][x] == 'yes' else 'no'
        tally[(rating, answer)] += 1

    len_yes = sum(tally[(rating, 'yes')] for rating in ratings)
    len_no = sum(tally[(rating, 'no')] for rating in ratings)
    # When a class is empty all its counts are 0, so dividing by 1 gives 0.0.
    yes_total = float(len_yes or 1)
    no_total = float(len_no or 1)

    probabilities = []
    for rating in ratings:
        probabilities.append(tally[(rating, 'yes')] / yes_total)
        probabilities.append(tally[(rating, 'no')] / no_total)
    q.put(probabilities)

def getStudentStatus(q):
    """Put the conditional probabilities of the student flag on *q*.

    Reads the module-level ``dicti`` (column dict) and ``tnor`` (row count).
    Any student value other than 'no' is counted as a student, and any
    ``buys_computer`` value other than 'yes' is counted as no.

    The single list put on *q* is ordered::

        [student|yes, student|no, non-student|yes, non-student|no]

    A class with no rows contributes probabilities of 0.0 instead of
    raising ZeroDivisionError.
    """
    #student
    info('function getStudentStatus')
    statuses = ('yes', 'no')
    tally = dict(((status, answer), 0)
                 for status in statuses for answer in ('yes', 'no'))
    for x in range(tnor):
        status = 'no' if dicti['student'][x] == 'no' else 'yes'
        answer = 'yes' if dicti['buys_computer'][x] == 'yes' else 'no'
        tally[(status, answer)] += 1

    len_yes = sum(tally[(status, 'yes')] for status in statuses)
    len_no = sum(tally[(status, 'no')] for status in statuses)
    # When a class is empty all its counts are 0, so dividing by 1 gives 0.0.
    yes_total = float(len_yes or 1)
    no_total = float(len_no or 1)

    probabilities = []
    for status in statuses:
        probabilities.append(tally[(status, 'yes')] / yes_total)
        probabilities.append(tally[(status, 'no')] / no_total)
    q.put(probabilities)

if __name__ == '__main__':
    # dicti and tnor are module globals: the worker functions read them.
    dicti = readfile("data_nb.txt")
    print(dicti)

    #Total number of rows in the record
    tnor = len(dicti['buys_computer'])
    print(tnor)
    if tnor == 0:
        raise SystemExit("data_nb.txt contains no records")

    # Class priors P(yes) and P(no); both are derived from the row counts.
    aprior_num_yes = dicti['buys_computer'].count('yes')
    aprior_yes = aprior_num_yes / float(tnor)
    aprior_no = (tnor - aprior_num_yes) / float(tnor)

    # Naive Bayes score = prior * product of per-attribute likelihoods.
    yes_total_prob = aprior_yes
    no_total_prob = aprior_no

    info('main line')
    # One queue per worker: all workers can run at the same time and each
    # result is still matched to the function that produced it.
    targets = (getIncome, getStudentStatus, getCreditRating)
    queues = [Queue() for _ in targets]
    p = [Process(target=fn, args=(chan,)) for fn, chan in zip(targets, queues)]
    for x in p:
        x.start()
    # Drain the queues before join(), as the multiprocessing docs advise for
    # processes that have put items on a queue.
    process_output = [chan.get() for chan in queues]
    for x in p:
        x.join()

    # Unpack in exactly the order each worker puts its values.
    (income_high_yes_prob, income_high_no_prob,
     income_low_yes_prob, income_low_no_prob,
     income_medium_yes_prob, income_medium_no_prob) = process_output[0]

    (studyes_yes_prob, studyes_no_prob,
     studno_yes_prob, studno_no_prob) = process_output[1]

    (credit_fair_yes_prob, credit_fair_no_prob,
     credit_exec_yes_prob, credit_exec_no_prob) = process_output[2]

    income_x = raw_input("Enter the income:\n>")
    credit_x = raw_input("Enter the credit rating:\n>")
    stud_x = raw_input("Enter the student status:\n>")

    if income_x == 'high':
        yes_total_prob *= income_high_yes_prob
        no_total_prob *= income_high_no_prob
    elif income_x == 'medium':
        yes_total_prob *= income_medium_yes_prob
        no_total_prob *= income_medium_no_prob
    else:
        yes_total_prob *= income_low_yes_prob
        no_total_prob *= income_low_no_prob

    if credit_x == 'fair':
        yes_total_prob *= credit_fair_yes_prob
        no_total_prob *= credit_fair_no_prob
    else:
        yes_total_prob *= credit_exec_yes_prob
        no_total_prob *= credit_exec_no_prob

    if stud_x == 'yes':
        yes_total_prob *= studyes_yes_prob
        no_total_prob *= studyes_no_prob
    else:
        yes_total_prob *= studno_yes_prob
        no_total_prob *= studno_no_prob

    print("No Probablility: " + str(no_total_prob))
    print("Yes Probability: " + str(yes_total_prob))
    if no_total_prob > yes_total_prob:
        print("NO")
    else:
        print("YES")

filename: data_nb.txt

10 high no fair no
12 high no excellent no
32 high no fair yes
64 medium no fair yes
65 low yes fair yes
60 low yes excellent no
37 low yes excellent yes
18 medium no fair no
16 low yes fair yes
69 medium yes fair yes
15 medium yes excellent yes
36 medium no excellent yes
31 high yes fair yes
70 medium no excellent no

OUTPUT

Amols-MacBook-Air:b12 Darwin$ python b12.py
{'credit_rating': ['fair', 'excellent', 'fair', 'fair', 'fair', 'excellent', 'excellent', 'fair', 'fair', 'fair', 'excellent', 'excellent', 'fair', 'excellent'], 'buys_computer': ['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no'], 'age': ['10', '12', '32', '64', '65', '60', '37', '18', '16', '69', '15', '36', '31', '70'], 'student': ['no', 'no', 'no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no'], 'income': ['high', 'high', 'high', 'medium', 'low', 'low', 'low', 'medium', 'low', 'medium', 'medium', 'medium', 'high', 'medium']}
14
main line
module name: __main__
parent process: 4406
process id: 4594
function getIncome
module name: __main__
parent process: 4594
process id: 4595
function getStudentStatus
module name: __main__
parent process: 4594
process id: 4596
Enter the income:
>high
Enter the credit rating:
>fair
Enter the student status:
>no
No Probablility: 0.128
Yes Probability: 0.0493827160494
NO

Implement the Apriori approach for data mining to organize the data items on a shelf, using the dataset.csv file of items purchased in a mall given below

PROGRAM

filename: apriori.py

from collections import defaultdict

def readFile(fn):
    """Yield one frozenset of items per non-blank line of a CSV file.

    Surrounding whitespace and trailing commas are removed from each line
    before it is split on ','.  Lines that are empty after that clean-up
    are skipped, so they do not produce a transaction containing ''.

    Args:
        fn: path of the comma-separated transaction file.
    """
    # The with-block closes the file once the generator is exhausted or
    # discarded.
    with open(fn, "r") as f:
        for line in f:
            line = line.strip().rstrip(",")
            if not line:
                continue
            yield frozenset(line.split(','))

def returnItemSetTL(data_iter):
    """Return (set of 1-item frozensets, list of transactions as frozensets)."""
    TransactionList = [frozenset(record) for record in data_iter]
    _itemSet = set(
        frozenset([item])
        for trans in TransactionList
        for item in trans
    )
    return _itemSet, TransactionList

def returnItemWithMinSupport(itemSet, TransactionList, ms, freqSet):
    """Return the itemsets whose support is at least *ms*.

    Args:
        itemSet: candidate itemsets (frozensets).
        TransactionList: list of transactions (frozensets).
        ms: minimum support, as a fraction of the number of transactions.
        freqSet: mapping updated in place; the occurrence count of every
            candidate found in at least one transaction is added to it.

    Returns:
        The set of candidates that meet the threshold.
    """
    total = float(len(TransactionList))
    _itemSet = set()
    for item in itemSet:
        count = sum(1 for trans in TransactionList if item.issubset(trans))
        if not count:
            # Candidates that never occur are neither counted nor returned.
            continue
        freqSet[item] += count
        # Compare against the ms argument, not a module-level global.
        if count / total >= ms:
            _itemSet.add(item)
    return _itemSet

def joinSet(itemSet, length):
    """Return every union of two members of *itemSet* that has *length* items."""
    joined = set()
    for left in itemSet:
        for right in itemSet:
            merged = left.union(right)
            if len(merged) == length:
                joined.add(merged)
    return joined

def getSupport(freqSet, item, TransactionList):
    """Return the count stored for *item* divided by the number of transactions."""
    occurrences = float(freqSet[item])
    transaction_total = float(len(TransactionList))
    return occurrences / transaction_total

def runApriori(data_iter, ms, mc):
    """Run the Apriori level-wise search and print the results.

    Prints the frequent 1-itemsets, the candidate set generated at each
    later level, the list of all frequent itemsets with their support, and
    finally the largest frequent itemsets in ascending order of support.

    Args:
        data_iter: iterable of transactions (each an iterable of items).
        ms: minimum support as a fraction of the number of transactions.
        mc: minimum confidence; accepted but not used by this function.

    Returns:
        A list of ``(itemset_tuple, support)`` pairs, one per frequent
        itemset, ordered by itemset size.
    """
    itemSet, TransactionList = returnItemSetTL(data_iter)

    freqSet = defaultdict(int)
    largeSet = {}
    currentLSet = returnItemWithMinSupport(itemSet, TransactionList, ms, freqSet)
    k = 2
    print("L1")
    print(currentLSet)
    while currentLSet:
        largeSet[k - 1] = currentLSet
        candidates = joinSet(currentLSet, k)
        print("L %d" % k)
        print(candidates)
        currentLSet = returnItemWithMinSupport(candidates, TransactionList, ms, freqSet)
        k += 1

    items = []
    # Walk the levels in size order so the report is deterministic.
    for level in sorted(largeSet):
        items.extend(
            (tuple(item), getSupport(freqSet, item, TransactionList))
            for item in largeSet[level]
        )
    print(items)

    # "or [0]" keeps max() valid when nothing met the support threshold.
    max_value = max([len(item) for item, _ in items] or [0])
    for item, support in sorted(items, key=lambda pair: pair[1]):
        if len(item) == max_value:
            print(item)
    return items

if __name__ == '__main__':
    # Thresholds for the run; both stay module-level names.
    minSupport, minConfidence = 0.6, 1
    inFile = readFile("dataset.csv")
    runApriori(inFile, minSupport, minConfidence)

filename: dataset.csv

Mango,Onion,Jar,Keychain,Eggs,Chocolate
Nuts,Onion,Jar,Keychain,Eggs,Chocolate
Mango,Apple,Keychain,Eggs
Mango,Toothbrush,Corn,Keychain,Chocolate
Corn,Onion,Onion,Keychain,Knife,Eggs

OUTPUT

Amols-MacBook-Air:b10 Darwin$ python apriori.py
L1
set([frozenset(['Onion']), frozenset(['Chocolate']), frozenset(['Keychain']), frozenset(['Eggs']), frozenset(['Mango'])])
L 2
set([frozenset(['Mango', 'Keychain']), frozenset(['Eggs', 'Onion']), frozenset(['Onion', 'Chocolate']), frozenset(['Mango', 'Chocolate']), frozenset(['Keychain', 'Chocolate']), frozenset(['Eggs', 'Chocolate']), frozenset(['Keychain', 'Onion']), frozenset(['Mango', 'Onion']), frozenset(['Eggs', 'Keychain']), frozenset(['Eggs', 'Mango'])])
L 3
set([frozenset(['Eggs', 'Keychain', 'Onion']), frozenset(['Eggs', 'Keychain', 'Chocolate']), frozenset(['Mango', 'Keychain', 'Chocolate']), frozenset(['Mango', 'Keychain', 'Onion']), frozenset(['Eggs', 'Mango', 'Keychain']), frozenset(['Keychain', 'Onion', 'Chocolate'])])
L 4
set([])
[(('Onion',), 0.6), (('Chocolate',), 0.6), (('Keychain',), 1.0), (('Eggs',), 0.8), (('Mango',), 0.6), (('Eggs', 'Keychain'), 0.8), (('Mango', 'Keychain'), 0.6), (('Eggs', 'Onion'), 0.6), (('Keychain', 'Onion'), 0.6), (('Keychain', 'Chocolate'), 0.6), (('Eggs', 'Keychain', 'Onion'), 0.6)]
('Eggs', 'Keychain', 'Onion')

Using any similarity-based technique, develop an application to classify text data. Perform pre-processing tasks as required

PROGRAM

filename: b9.py

from math import sqrt
from math import log
from collections import Counter
from operator import itemgetter

def tf(kt, doc):
    """Return how many times *kt* occurs in *doc*, using ``doc.count``."""
    occurrences = doc.count(kt)
    return occurrences

def idf(kt, all_docs):
    """Return ln(len(all_docs) / matching docs) rounded to 3 places, or 0.

    A document matches when ``kt in document`` is true; 0 is returned when
    no document matches.
    """
    matching = sum(1 for document in all_docs if kt in document)
    if matching <= 0:
        return 0
    ratio = float(len(all_docs)) / float(matching)
    return round(float(log(ratio)), 3)

def tfidf(kt, doc):
    """Return tf(kt, doc) multiplied by idf over the module-level ``all_docs``."""
    frequency = tf(kt, doc)
    rarity = idf(kt, all_docs)
    return frequency * rarity

def cos_sim(infile, docs, ktrms):
    """Return the cosine similarity of two texts over the terms in *ktrms*.

    The result is rounded to 3 places; 0 is returned when either text has
    a zero-length vector.
    """
    dot_product = sum(tfidf(term, infile) * tfidf(term, docs) for term in ktrms)
    magnitude = doclen(infile, ktrms) * doclen(docs, ktrms)
    if magnitude:
        return round(dot_product / magnitude, 3)
    return 0

def doclen(doc, ktrms):
    """Return the Euclidean length of *doc*'s tf-idf vector over *ktrms*."""
    squares = sum(pow(tfidf(term, doc), 2) for term in ktrms)
    return sqrt(squares)

# Corpus state.  all_docs is read as a global by tfidf().
files = []
all_docs = []
key_terms = []

documents = ['doc1.txt', 'doc2.txt', 'doc3.txt', 'doc4.txt', 'doc5.txt', 'doc6.txt']
# One [file name, category] pair per corpus document, in the same order as
# `documents`; the similarity score is appended to each pair below.
result = [['doc1.txt', 'animals'], ['doc2.txt', 'animals'], ['doc3.txt', 'animals'], ['doc4.txt', 'sports'], ['doc5.txt', 'sports'], ['doc6.txt', 'sports']]

for x in documents:
    # Close every corpus file as soon as it has been read.
    with open(x, 'r') as handle:
        files.append(handle.read())

for x in files:
    all_docs.append(x.lower().rstrip('\n'))

# Vocabulary: every distinct whitespace-separated token in the corpus.
for x in all_docs:
    key_terms = key_terms + x.split()
key_terms = list(set(key_terms))

filename = raw_input("Enter test file: ")
# Read the whole test file (not just its first line), the same way the
# corpus documents are read.
with open(filename, 'r') as handle:
    inputfile = handle.read().lower().rstrip('\n')

for entry, doc in zip(result, all_docs):
    entry.append(cos_sim(inputfile, doc, key_terms))
print(result)

k = 3
sortedresult = sorted(result, key=itemgetter(2), reverse=True)
# Keep the k most similar documents, ignoring those with zero similarity.
top_k = [x for x in sortedresult[:k] if x[2] != 0]
if len(top_k) == 0:
    print("Does not match")
else:
    class_count = Counter(category for (document, category, value) in top_k)
    print(class_count)
    classification = max(class_count, key=lambda cls: class_count[cls])
    # Two spaces after the colon reproduce the original
    # `print "Class of test file: ", classification` output exactly.
    print("Class of test file:  " + classification)

'''
TF: Term Frequency, which measures how frequently a term occurs in a document. Since every document is different in length, it is possible that a term would appear many more times in long documents than in shorter ones. Thus, the term frequency is often divided by the document length (i.e. the total number of terms in the document) as a way of normalization:
TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document).

IDF: Inverse Document Frequency, which measures how important a term is. While computing TF, all terms are considered equally important. However, it is known that certain terms, such as "is", "of", and "that", may appear many times but have little importance. Thus we need to weigh down the frequent terms while scaling up the rare ones, by computing the following:
IDF(t) = log_e(Total number of documents / Number of documents with term t in it).
'''

filename: doc1.txt
Animals live on land and water.Land animals include cats, cows.Water animals include all types of fishes.

filename: doc2.txt
Animals can be classified as herbivorous which is plant eating, carnivorous which is flesh eating and omnivorous which is eating both.

filename: doc3.txt
All land animals have two eyes and ears.They have one tail.

filename: doc4.txt
Sports are all forms of usually competitive activity which,through casual or organised participation, aim to use, maintain or improve physical ability and skills.

filename: doc5.txt
Different sport have different rules.

filename: doc6.txt
Depending on the sport, every sport has different number of players in each team.

filename: t.txt
Dog is an omnivorous creature that lives on land and has one tail.

filename: t2.txt
Football is a competitive sport with eleven players.

OUTPUT

Amols-Air:b9 Darwin$ python b9.py
Enter test file: t.txt
[['doc1.txt', 'animals', 0.249], ['doc2.txt', 'animals', 0.162], ['doc3.txt', 'animals', 0.388], ['doc4.txt', 'sports', 0.067], ['doc5.txt', 'sports', 0.011], ['doc6.txt', 'sports', 0.164]]
Counter({'animals': 2, 'sports': 1})
Class of test file: animals

Implementing recursive descent parser for sample language

PROGRAM

filename: b7.py

"""
exp ::= term | exp + term | exp - term
term ::= factor | factor * term | factor / term
factor ::= number | ( exp )
"""
class Calculator():
    """Recursive-descent evaluator for +, -, *, / and parentheses.

    Grammar as implemented (operators of equal precedence associate to the
    left, so ``8/4/2`` is ``(8/4)/2``)::

        exp    ::= term   { ('+' | '-') term }
        term   ::= factor { ('*' | '/') factor }
        factor ::= number | '(' exp ')'

    Args:
        tokens: list of token strings, e.g. ``['5', '+', '8', '*', '9']``.
    """

    def __init__(self, tokens):
        self._tokens = tokens
        # An empty token list is reported by factor() instead of raising
        # IndexError here.
        self._current = tokens[0] if tokens else None

    def exp(self):
        """Parse and evaluate an additive expression; return a float."""
        result = self.term()
        while self._current in ('+', '-'):
            operator = self._current
            self.next()
            operand = self.term()
            if operator == '+':
                result += operand
            else:
                result -= operand
        return result

    def factor(self):
        """Parse a number or a parenthesised expression; return a float.

        Raises:
            ValueError: at the end of input, on an unexpected token, on a
                missing ')' or on a malformed number.
        """
        token = self._current
        if token is None:
            raise ValueError("unexpected end of expression")
        if token[0].isdigit() or token[-1].isdigit():
            result = float(token)
            self.next()
            return result
        if token == '(':
            self.next()
            result = self.exp()
            if self._current != ')':
                raise ValueError("expected ')'")
            self.next()
            return result
        raise ValueError("unexpected token %r" % (token,))

    def next(self):
        """Advance to the next token; ``_current`` becomes None at the end."""
        self._tokens = self._tokens[1:]
        self._current = self._tokens[0] if len(self._tokens) > 0 else None

    def term(self):
        """Parse and evaluate a multiplicative expression; return a float."""
        result = self.factor()
        while self._current in ('*', '/'):
            operator = self._current
            self.next()
            # Taking a single factor (not a whole term) keeps * and /
            # left-associative.
            operand = self.factor()
            if operator == '*':
                result *= operand
            else:
                result /= operand
        return result

def tokenize(text):
    """Split an arithmetic expression into number and symbol tokens.

    Spaces are ignored.  Consecutive digits, with at most one embedded '.',
    are merged into a single number token; every other character becomes
    its own token.

    Args:
        text: the raw expression, e.g. ``'12.55*(3+4)'``.

    Returns:
        The list of token strings (empty for blank input).

    Raises:
        ValueError: if the expression ends with one of ``* / - +``.
    """
    chars = text.replace(' ', '')
    if chars and chars[-1] in ('*', '/', '-', '+'):
        raise ValueError("expression ends with an operator")
    tokens = []
    for ch in chars:
        previous = tokens[-1] if tokens else ''
        # A digit extends the previous token when that token ends in a digit
        # or a '.', so '1.55' stays one token.
        if ch.isdigit() and previous and (previous[-1].isdigit() or previous[-1] == '.'):
            tokens[-1] += ch
        elif ch == '.' and previous.isdigit():
            tokens[-1] += ch
        else:
            tokens.append(ch)
    return tokens


if __name__ == '__main__':
    while True:
        line = raw_input('> ')
        try:
            tokens = tokenize(line)
        except ValueError:
            # Reported per line; the next prompt starts fresh.
            print("Syntax Error String not accepted")
            continue
        if not tokens:
            continue  # nothing to evaluate on a blank line
        print(Calculator(tokens).exp())

OUTPUT

Amols-MacBook-Air:Codes Darwin$ python b7.py
> 5+8*9
77.0
> 4/4
1.0
> 4+2+
Syntax Error String not accepted
> ^CTraceback (most recent call last):
File "b7.py", line 47, in <module>
lst = list(raw_input('> ').replace(' ', ''))
KeyboardInterrupt

Code optimization using DAG

PROGRAM

filename: b4.py

from sys import argv

# Module-level tables shared by read_input(), optimise() and print_code().
mp = dict()              # variable name -> ExternalNode / InternalNode
list_statement = list()  # assigned names, in the order read_input() saw them
replace = dict()         # name -> name printed in its place by print_code()
not_used = list()        # statement names that print_code() skips

class ExternalNode():
    """Node that carries only a variable name (no operator, no children)."""

    # Class-level default; each instance overrides it in __init__.
    variable_identifier = ''

    def __init__(self, identifier):
        """Store *identifier* as this node's variable name."""
        self.variable_identifier = identifier

class InternalNode():
    """Operator node: an operator, two child nodes and the assigned name."""

    # Class-level defaults; each instance overrides them in __init__.
    variable_identifier = ''
    operator_value = ''
    lchild = None
    rchild = None

    def __init__(self, val, l, r, id):
        """Store operator *val*, children *l* and *r*, and result name *id*."""
        self.variable_identifier = id
        self.operator_value = val
        self.lchild, self.rchild = l, r

def read_input(file_name):
    """Read three-address statements and build their nodes.

    Each statement has the form ``name := first operator second``.  Reading
    stops at the first line that starts with ``return``; lines without
    ``:=`` (for example blank lines) are skipped.  The module-level tables
    ``mp``, ``replace`` and ``list_statement`` are updated in place.

    Args:
        file_name: path of the intermediate-code file.

    Raises:
        ValueError: if the right-hand side of a statement has fewer than
            three whitespace-separated parts.
    """
    # The with-block closes the file as soon as it has been read.
    with open(file_name, 'r') as source:
        file_var = source.read().strip().split("\n")

    for line in file_var:
        if line.startswith("return"):
            break
        if ":=" not in line:
            continue
        value, expression = line.split(":=", 1)
        value = value.strip()
        # split() with no argument tolerates repeated spaces and tabs.
        parts = expression.split()
        if len(parts) < 3:
            raise ValueError("malformed statement: %r" % (line,))
        first, operator, second = parts[:3]

        # With '=' the left operand always gets a fresh node, so later
        # statements no longer match expressions built on the old one.
        if operator == '=' or first not in mp:
            mp[first] = ExternalNode(first)
            replace[first] = first
        if second not in mp:
            mp[second] = ExternalNode(second)
            replace[second] = second

        list_statement.append(value)
        mp[value] = InternalNode(operator, mp[first], mp[second], value)
        replace[value] = value

def print_code():
    """Print every statement not listed in ``not_used``, with operands renamed via ``replace``."""
    for st in list_statement:
        if st in not_used:
            continue
        node = mp[st]
        left_name = replace[node.lchild.variable_identifier]
        right_name = replace[node.rchild.variable_identifier]
        print(node.variable_identifier + " = " + left_name + " "
              + node.operator_value + " " + right_name)

def optimise():
    """Mark later statements that repeat an earlier one as unused.

    Two statements match when they have the same operator and equal left
    and right child nodes.  The later statement's ``replace`` entry is set
    to the earlier statement's entry and its name is appended to
    ``not_used``.
    """
    for start, st in enumerate(list_statement):
        earlier = mp[st]
        for st2 in list_statement[start:]:
            later = mp[st2]
            duplicate = (earlier.lchild == later.lchild
                         and earlier.rchild == later.rchild
                         and earlier.operator_value == later.operator_value)
            if not duplicate:
                continue
            replace[st2] = replace[st]
            if st2 != st:
                not_used.append(st2)

if __name__ == "__main__":
    # Use the single command-line argument as the input path when exactly
    # one is given; otherwise fall back to the default file name.
    file_name = argv[1] if len(argv) == 2 else "intermediate_code"
    read_input(file_name)
    optimise()
    print_code()

filename: intermediate_code

t1 := a + b
t2 := c * t1
t3 := a + b
t4 := d * t3
t5 := t2 + t4
t6 := r = t5
t7 := t3 + a
t0 := a = t4
t8 := a + b
t9 := a + b
t10 := t9 + a
t11 := a = l
t12 := a + b
t13 := a + b
t14 := j + t13

OUTPUT

Amols-Air:b4 Darwin$ python b4.py
t1 = a + b
t2 = c * t1
t4 = d * t1
t5 = t2 + t4
t6 = r = t5
t7 = t1 + a
t0 = a = t4
t8 = a + b
t10 = t8 + a
t11 = a = l
t12 = a + b
t14 = j + t12

{iCode}