est ici une solution shell. Le meilleur algorithme semble être # 4. Il filtre tous les mots de longueur incorrecte. Ensuite, il somme les mots en utilisant un simple chiffre de substitution (a = 1, b = 2, A = 27, ...). Si les sommes correspondent, alors le tri original sera effectué et comparé. Sur mon système, il peut résonner en ~ 235k mots à la recherche de "chauve-souris" en un peu moins d'une demi-seconde. Je fournis toutes mes solutions pour que vous puissiez voir les différentes approches.
Mise à jour: non montré, mais j'ai aussi essayé de mettre la somme dans la première case de l'approche de l'histogramme que j'ai essayée, mais c'était encore plus lent que les histogrammes sans. Je pensais que cela fonctionnerait comme un court-circuit, mais cela n'a pas fonctionné.
Mise à jour 2: J'ai essayé la solution awk et elle fonctionne en environ 1/3 du temps de ma meilleure solution shell ou ~ 0,126s contre ~ 0,490s. La solution Perl s'exécute ~ 1,1s.
#!/bin/bash
word=$1
#dict=words
dict=/usr/share/dict/words
#dict=/usr/dict/words
alg1() {
sortedWord=`echo $word | grep -o . | sort | tr -d '\n'`
while read line
do
sortedLine=`echo $line | grep -o . | sort | tr -d '\n'`
if [ "$sortedWord" == "$sortedLine" ]
then
echo $line
fi
done < $dict
}
check_sorted_versus_not() {
local word=$1
local line=`echo $2 | grep -o . | sort | tr -d '\n'`
if [ "$word" == "$line" ]
then
echo $2
fi
}
# Filter out all words of incorrect length
alg2() {
sortedWord=`echo $word | grep -o . | sort | tr -d '\n'`
grep_string="^`echo -n $word | tr 'a-zA-Z' '.'`\$"
grep "$grep_string" "$dict" | \
while read line
do
sortedLine=`echo $line | grep -o . | sort | tr -d '\n'`
if [ "$sortedWord" == "$sortedLine" ]
then
echo $line
fi
done
}
# Create a lot of variables like this:
# _a=1, _b=2, ... _z=26, _A=27, _B=28, ... _Z=52
gen_chars() {
# [ -n "$GEN_CHARS" ] && return
GEN_CHARS=1
local alpha="abcdefghijklmnopqrstuvwxyz"
local upperalpha=`echo -n $alpha | tr 'a-z' 'A-Z'`
local both="$alpha$upperalpha"
for ((i=0; i < ${#both}; i++))
do
ACHAR=${both:i:1}
eval "_$ACHAR=$((i+1))"
done
}
# I think it's faster to return the value in a var then to echo it in a sub process.
# Try summing the word one char at a time by building an arithmetic expression
# and then evaluate that expression.
# Requires: gen_chars
sum_word() {
SUM=0
local s=""
# parsing input one character at a time
for ((i=0; i < ${#1}; i++))
do
ACHAR=${1:i:1}
s="$s\$_$ACHAR+"
done
SUM=$(($(eval echo -n ${s}0)))
}
# I think it's faster to return the value in a var then to echo it in a sub process.
# Try summing the word one char at a time using a case statement.
sum_word2() {
SUM=0
local s=""
# parsing input one character at a time
for ((i=0; i < ${#1}; i++))
do
ACHAR=${1:i:1}
case $ACHAR in
a) SUM=$((SUM+ 1));;
b) SUM=$((SUM+ 2));;
c) SUM=$((SUM+ 3));;
d) SUM=$((SUM+ 4));;
e) SUM=$((SUM+ 5));;
f) SUM=$((SUM+ 6));;
g) SUM=$((SUM+ 7));;
h) SUM=$((SUM+ 8));;
i) SUM=$((SUM+ 9));;
j) SUM=$((SUM+ 10));;
k) SUM=$((SUM+ 11));;
l) SUM=$((SUM+ 12));;
m) SUM=$((SUM+ 13));;
n) SUM=$((SUM+ 14));;
o) SUM=$((SUM+ 15));;
p) SUM=$((SUM+ 16));;
q) SUM=$((SUM+ 17));;
r) SUM=$((SUM+ 18));;
s) SUM=$((SUM+ 19));;
t) SUM=$((SUM+ 20));;
u) SUM=$((SUM+ 21));;
v) SUM=$((SUM+ 22));;
w) SUM=$((SUM+ 23));;
x) SUM=$((SUM+ 24));;
y) SUM=$((SUM+ 25));;
z) SUM=$((SUM+ 26));;
A) SUM=$((SUM+ 27));;
B) SUM=$((SUM+ 28));;
C) SUM=$((SUM+ 29));;
D) SUM=$((SUM+ 30));;
E) SUM=$((SUM+ 31));;
F) SUM=$((SUM+ 32));;
G) SUM=$((SUM+ 33));;
H) SUM=$((SUM+ 34));;
I) SUM=$((SUM+ 35));;
J) SUM=$((SUM+ 36));;
K) SUM=$((SUM+ 37));;
L) SUM=$((SUM+ 38));;
M) SUM=$((SUM+ 39));;
N) SUM=$((SUM+ 40));;
O) SUM=$((SUM+ 41));;
P) SUM=$((SUM+ 42));;
Q) SUM=$((SUM+ 43));;
R) SUM=$((SUM+ 44));;
S) SUM=$((SUM+ 45));;
T) SUM=$((SUM+ 46));;
U) SUM=$((SUM+ 47));;
V) SUM=$((SUM+ 48));;
W) SUM=$((SUM+ 49));;
X) SUM=$((SUM+ 50));;
Y) SUM=$((SUM+ 51));;
Z) SUM=$((SUM+ 52));;
*) SUM=0; return;;
esac
done
}
# I think it's faster to return the value in a var then to echo it in a sub process.
# Try summing the word by building an arithmetic expression using sed and then evaluating
# the expression.
# Requires: gen_chars
sum_word3() {
SUM=$(($(eval echo -n `echo -n $1 | sed -E -ne 's,.,$_&+,pg'`) 0))
#echo "SUM($1)=$SUM"
}
# Filter out all words of incorrect length
# Sum the characters in the word: i.e. a=1, b=2, ... and "abbc" = 1+2+2+3 = 8
alg3() {
gen_chars
sortedWord=`echo $word | grep -o . | sort | tr -d '\n'`
sum_word $word
word_sum=$SUM
grep_string="^`echo -n $word | tr 'a-zA-Z' '.'`\$"
grep "$grep_string" "$dict" | \
while read line
do
sum_word $line
line_sum=$SUM
if [ $word_sum == $line_sum ]
then
check_sorted_versus_not $sortedWord $line
fi
done
}
# Filter out all words of incorrect length
# Sum the characters in the word: i.e. a=1, b=2, ... and "abbc" = 1+2+2+3 = 8
# Use sum_word2
alg4() {
sortedWord=`echo $word | grep -o . | sort | tr -d '\n'`
sum_word2 $word
word_sum=$SUM
grep_string="^`echo -n $word | tr 'a-zA-Z' '.'`\$"
grep "$grep_string" "$dict" | \
while read line
do
sum_word2 $line
line_sum=$SUM
if [ $word_sum == $line_sum ]
then
check_sorted_versus_not $sortedWord $line
fi
done
}
# Filter out all words of incorrect length
# Sum the characters in the word: i.e. a=1, b=2, ... and "abbc" = 1+2+2+3 = 8
# Use sum_word3
alg5() {
gen_chars
sortedWord=`echo $word | grep -o . | sort | tr -d '\n'`
sum_word3 $word
word_sum=$SUM
grep_string="^`echo -n $word | tr 'a-zA-Z' '.'`\$"
grep "$grep_string" "$dict" | \
while read line
do
sum_word3 $line
line_sum=$SUM
if [ $word_sum == $line_sum ]
then
check_sorted_versus_not $sortedWord $line
fi
done
}
# I think it's faster to return the value in a var then to echo it in a sub process.
# Try summing the word one char at a time using a case statement.
# Place results in a histogram
sum_word4() {
SUM=(0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0
0)
# parsing input one character at a time
for ((i=0; i < ${#1}; i++))
do
ACHAR=${1:i:1}
case $ACHAR in
a) SUM[1]=$((SUM[ 1] + 1));;
b) SUM[2]=$((SUM[ 2] + 1));;
c) SUM[3]=$((SUM[ 3] + 1));;
d) SUM[4]=$((SUM[ 4] + 1));;
e) SUM[5]=$((SUM[ 5] + 1));;
f) SUM[6]=$((SUM[ 6] + 1));;
g) SUM[7]=$((SUM[ 7] + 1));;
h) SUM[8]=$((SUM[ 8] + 1));;
i) SUM[9]=$((SUM[ 9] + 1));;
j) SUM[10]=$((SUM[10] + 1));;
k) SUM[11]=$((SUM[11] + 1));;
l) SUM[12]=$((SUM[12] + 1));;
m) SUM[13]=$((SUM[13] + 1));;
n) SUM[14]=$((SUM[14] + 1));;
o) SUM[15]=$((SUM[15] + 1));;
p) SUM[16]=$((SUM[16] + 1));;
q) SUM[17]=$((SUM[17] + 1));;
r) SUM[18]=$((SUM[18] + 1));;
s) SUM[19]=$((SUM[19] + 1));;
t) SUM[20]=$((SUM[20] + 1));;
u) SUM[21]=$((SUM[21] + 1));;
v) SUM[22]=$((SUM[22] + 1));;
w) SUM[23]=$((SUM[23] + 1));;
x) SUM[24]=$((SUM[24] + 1));;
y) SUM[25]=$((SUM[25] + 1));;
z) SUM[26]=$((SUM[26] + 1));;
A) SUM[27]=$((SUM[27] + 1));;
B) SUM[28]=$((SUM[28] + 1));;
C) SUM[29]=$((SUM[29] + 1));;
D) SUM[30]=$((SUM[30] + 1));;
E) SUM[31]=$((SUM[31] + 1));;
F) SUM[32]=$((SUM[32] + 1));;
G) SUM[33]=$((SUM[33] + 1));;
H) SUM[34]=$((SUM[34] + 1));;
I) SUM[35]=$((SUM[35] + 1));;
J) SUM[36]=$((SUM[36] + 1));;
K) SUM[37]=$((SUM[37] + 1));;
L) SUM[38]=$((SUM[38] + 1));;
M) SUM[39]=$((SUM[39] + 1));;
N) SUM[40]=$((SUM[40] + 1));;
O) SUM[41]=$((SUM[41] + 1));;
P) SUM[42]=$((SUM[42] + 1));;
Q) SUM[43]=$((SUM[43] + 1));;
R) SUM[44]=$((SUM[44] + 1));;
S) SUM[45]=$((SUM[45] + 1));;
T) SUM[46]=$((SUM[46] + 1));;
U) SUM[47]=$((SUM[47] + 1));;
V) SUM[48]=$((SUM[48] + 1));;
W) SUM[49]=$((SUM[49] + 1));;
X) SUM[50]=$((SUM[50] + 1));;
Y) SUM[51]=$((SUM[51] + 1));;
Z) SUM[52]=$((SUM[52] + 1));;
*) SUM[53]=-1; return;;
esac
done
#echo ${SUM[*]}
}
# Check if two histograms are equal
hist_are_equal() {
# Array sizes differ?
[ ${#_h1[*]} != ${#SUM[*]} ] && return 1
# parsing input one index at a time
for ((i=0; i < ${#_h1[*]}; i++))
do
[ ${_h1[i]} != ${SUM[i]} ] && return 1
done
return 0
}
# Check if two histograms are equal
hist_are_equal2() {
# Array sizes differ?
local size=${#_h1[*]}
[ $size != ${#SUM[*]} ] && return 1
# parsing input one index at a time
for ((i=0; i < $size; i++))
do
[ ${_h1[i]} != ${SUM[i]} ] && return 1
done
return 0
}
# Filter out all words of incorrect length
# Use sum_word4 which generates a histogram of character frequency
alg6() {
sum_word4 $word
_h1=${SUM[*]}
grep_string="^`echo -n $word | tr 'a-zA-Z' '.'`\$"
grep "$grep_string" "$dict" | \
while read line
do
sum_word4 $line
if hist_are_equal
then
echo $line
fi
done
}
# Filter out all words of incorrect length
# Use sum_word4 which generates a histogram of character frequency
alg7() {
sum_word4 $word
_h1=${SUM[*]}
grep_string="^`echo -n $word | tr 'a-zA-Z' '.'`\$"
grep "$grep_string" "$dict" | \
while read line
do
sum_word4 $line
if hist_are_equal2
then
echo $line
fi
done
}
run_test() {
echo alg$1
eval time alg$1
}
#run_test 1
#run_test 2
#run_test 3
run_test 4
#run_test 5
run_test 6
#run_test 7
Faut-il renvoyer 'tat'? – kennytm
votre fichier de dictionnaire est plus de 400000 entrées si je ne me trompe pas. appeler/piping vers une commande externe comme grep, sort, tr pour CHAQUE ligne du dictionnaire est inefficace. – ghostdog74
non, "tat" ne doit pas être retourné. Seules les permutations du mot original sont autorisées. – dogbane