Skip to content

Commit

Permalink
fpu add fpuIgnoreSubnormal (save about 1K lut (64 bits) and quite some timings)
Browse files Browse the repository at this point in the history
  • Loading branch information
Dolu1990 committed Jul 11, 2024
1 parent b828c85 commit ed777de
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 52 deletions.
14 changes: 10 additions & 4 deletions src/main/scala/vexiiriscv/Param.scala
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ class ParamSimple(){
var withRvf = false
var skipFma = false
var fpuFmaFullAccuracy = true
var fpuIgnoreSubnormal = false
var withRvd = false
var withRvZb = false
var privParam = PrivilegedParam.base
Expand Down Expand Up @@ -188,8 +189,12 @@ class ParamSimple(){
withAlignerBuffer = true
// withRvc = true
withRva = true
// withRvf = true
// withRvd = true

withRvf = true
withRvd = true
fpuIgnoreSubnormal = true
fpuFmaFullAccuracy = false

withMmu = true
privParam.withSupervisor = true
privParam.withUser = true
Expand Down Expand Up @@ -297,6 +302,7 @@ class ParamSimple(){
opt[Unit]("with-rvc") action { (v, c) => withRvc = true; withAlignerBuffer = true }
opt[Unit]("with-rvZb") action { (v, c) => withRvZb = true }
opt[Unit]("fma-reduced-accuracy") action { (v, c) => fpuFmaFullAccuracy = false }
opt[Unit]("fpu-ignore-subnormal") action { (v, c) => fpuIgnoreSubnormal = true }
opt[Unit]("with-aligner-buffer") unbounded() action { (v, c) => withAlignerBuffer = true }
opt[Unit]("with-dispatcher-buffer") action { (v, c) => withDispatcherBuffer = true }
opt[Unit]("with-supervisor") action { (v, c) => privParam.withSupervisor = true; privParam.withUser = true; withMmu = true }
Expand Down Expand Up @@ -706,7 +712,7 @@ class ParamSimple(){
plugins += new WriteBackPlugin(lane0, FloatRegFile, writeAt = 9, allowBypassFrom = allowBypassFrom.max(2)) //Max 2 to save area on not so important instructions
plugins += new execute.fpu.FpuFlagsWritebackPlugin(lane0, pipTo = intWritebackAt)
plugins += new execute.fpu.FpuCsrPlugin(List(lane0), intWritebackAt)
plugins += new execute.fpu.FpuUnpackerPlugin(early0)
plugins += new execute.fpu.FpuUnpackerPlugin(early0, ignoreSubnormal = fpuIgnoreSubnormal)
plugins += new execute.fpu.FpuAddSharedPlugin(lane0)
plugins += new execute.fpu.FpuAddPlugin(early0)
plugins += new execute.fpu.FpuMulPlugin(early0, withFma = !skipFma, fmaFullAccuracy = fpuFmaFullAccuracy)
Expand All @@ -717,7 +723,7 @@ class ParamSimple(){
plugins += new execute.fpu.FpuMvPlugin(early0, floatWbAt = 2)
if(withRvd) plugins += new execute.fpu.FpuXxPlugin(early0)
plugins += new execute.fpu.FpuDivPlugin(early0)
plugins += new execute.fpu.FpuPackerPlugin(lane0)
plugins += new execute.fpu.FpuPackerPlugin(lane0, ignoreSubnormal = fpuIgnoreSubnormal)
// plugins += new execute.fpu.FpuEmbedded()
}

Expand Down
44 changes: 25 additions & 19 deletions src/main/scala/vexiiriscv/execute/fpu/FpuPackerPlugin.scala
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer


case class FpuPackerCmd(p : FloatUnpackedParam, ats : Seq[Int]) extends Bundle{
case class FpuPackerCmd(p : FloatUnpackedParam,
ats : Seq[Int]) extends Bundle{
val at = Bits(ats.size bits)
val value = FloatUnpacked(p)
val format = FpuFormat()
Expand All @@ -31,6 +32,7 @@ class FpuPackerPort(_cmd : FpuPackerCmd) extends Area{
}

class FpuPackerPlugin(val lane: ExecuteLanePlugin,
var ignoreSubnormal: Boolean = false,
var wbAt : Int = 2) extends FiberPlugin with RegFileWriterService {
val p = FpuUtils

Expand Down Expand Up @@ -101,29 +103,33 @@ class FpuPackerPlugin(val lane: ExecuteLanePlugin,
}

val EXP_SUBNORMAL = insert(AFix(p.muxDouble[SInt](FORMAT)(-1023)(-127)))
val SUBNORMAL = insert(VALUE.exponent <= EXP_SUBNORMAL && VALUE.isNormal)
val subnormal = !ignoreSubnormal generate new Area{
val ENABLE = insert(ignoreSubnormal.mux(False, VALUE.exponent <= EXP_SUBNORMAL && VALUE.isNormal))
}
}

import s0._


val s1 = new pip.Area(1) {
val MAN_SHIFTED = insert(U(VALUE.mantissa.raw))

// First we check if we are subnormal, in which case we need to denormalize the mantissa
val EXP_DIF_PLUS_ONE = insert(U(EXP_SUBNORMAL - VALUE.exponent) + 1)

val manShiftNoSat = EXP_DIF_PLUS_ONE
val manShift = RegNext(manShiftNoSat.sat(widthOf(manShiftNoSat) - log2Up(p.mantissaWidth + 2)))
val manShifter = RegNext(U(Shift.rightWithScrap(True ## VALUE.mantissa.raw, manShift).dropHigh(1)))
val MAN_SHIFTED = insert(manShifter)
when(!SUBNORMAL){
MAN_SHIFTED := U(VALUE.mantissa.raw)
val subnormal = !ignoreSubnormal generate new Area {
val EXP_DIF_PLUS_ONE = insert(U(EXP_SUBNORMAL - VALUE.exponent) + 1)
val manShiftNoSat = EXP_DIF_PLUS_ONE
val manShift = RegNext(manShiftNoSat.sat(widthOf(manShiftNoSat) - log2Up(p.mantissaWidth + 2)))
val manShifter = RegNext(U(Shift.rightWithScrap(True ## VALUE.mantissa.raw, manShift).dropHigh(1)))
when(s0.subnormal.ENABLE) {
MAN_SHIFTED := manShifter
}
val counter = Reg(UInt(2 bits)) init(0)
val freezeIt = isValid && s0.subnormal.ENABLE && counter =/= 2
lane.freezeWhen(freezeIt)
when(freezeIt) { counter := counter + 1 }
when(!lane.isFreezed()){ counter := 0 }
}

val counter = Reg(UInt(2 bits)) init(0)
val freezeIt = isValid && SUBNORMAL && counter =/= 2
lane.freezeWhen(freezeIt)
when(freezeIt) { counter := counter + 1 }
when(!lane.isFreezed()){ counter := 0 }

val f32ManPos = p.mantissaWidth + 2 - 23
val roundAdjusted = insert(p.muxDouble(FORMAT)(MAN_SHIFTED(0, 2 bits))(MAN_SHIFTED(f32ManPos - 2, 2 bits) | U(MAN_SHIFTED(f32ManPos - 2 - 1 downto 0).orR, 2 bits)))
Expand All @@ -149,11 +155,11 @@ class FpuPackerPlugin(val lane: ExecuteLanePlugin,
import s1._

val s2 = new pip.Area(wbAt) {
val SUBNORMAL_FINAL = insert((EXP_SUBNORMAL - EXP_RESULT).isPositive())
val SUBNORMAL_FINAL = insert(ignoreSubnormal.mux(False, (EXP_SUBNORMAL - EXP_RESULT).isPositive()))
val EXP = insert(!SUBNORMAL_FINAL ? (EXP_RESULT - EXP_SUBNORMAL) | AFix(0))

val EXP_MAX = insert(AFix(p.muxDouble[SInt](FORMAT)(1023)(127)))
val EXP_MIN = insert(AFix(p.muxDouble[SInt](FORMAT)(-1023 - 52 + 1)(-127 - 23 + 1)))
val EXP_MIN = insert(AFix(p.muxDouble[SInt](FORMAT)(-1023 - ignoreSubnormal.mux(0, 52 + 1))(-127 - ignoreSubnormal.mux(0, 23 + 1))))
val EXP_OVERFLOW = insert(EXP_RESULT > EXP_MAX)
val EXP_UNDERFLOW = insert(EXP_RESULT < EXP_MIN)

Expand Down Expand Up @@ -195,9 +201,9 @@ class FpuPackerPlugin(val lane: ExecuteLanePlugin,
manQuiet := VALUE.quiet
}
is(FloatMode.NORMAL) {
when(roundAdjusted =/= 0) {
if(!ignoreSubnormal) when(roundAdjusted =/= 0) {
nx := True
when(SUBNORMAL_FINAL || SUBNORMAL && !tinyOverflow) {
when(SUBNORMAL_FINAL || s0.subnormal.ENABLE && !tinyOverflow) {
uf := True
}
}
Expand Down
37 changes: 22 additions & 15 deletions src/main/scala/vexiiriscv/execute/fpu/FpuUnpackerPlugin.scala
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@ import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer


class FpuUnpackerPlugin(val layer : LaneLayer, unpackAt : Int = 0, packAt : Int = 0) extends FiberPlugin{
class FpuUnpackerPlugin(val layer : LaneLayer,
var ignoreSubnormal : Boolean = false,
unpackAt : Int = 0,
packAt : Int = 0) extends FiberPlugin{
val p = FpuUtils

val elaborationLock = Retainer()
Expand Down Expand Up @@ -97,6 +100,8 @@ class FpuUnpackerPlugin(val layer : LaneLayer, unpackAt : Int = 0, packAt : Int

val rsList = unpackSpec.keys.toArray

val withRsUnpack = !ignoreSubnormal

val unpacker = new StagePipeline { //TODO this is kind of bloated now that all unpackers are unified
val ohInputWidth = p.rsIntWidth max Riscv.fpuMantissaWidth

Expand All @@ -109,7 +114,7 @@ class FpuUnpackerPlugin(val layer : LaneLayer, unpackAt : Int = 0, packAt : Int
val data = Bits(ohInputWidth bits)
}

val portCount = 2
val portCount = 1+withRsUnpack.toInt
val arbiter = StreamArbiterFactory().noLock.lowerFirst.build(Request(), portCount)
val results = Vec.fill(portCount)(Flow(Result()))

Expand Down Expand Up @@ -145,10 +150,12 @@ class FpuUnpackerPlugin(val layer : LaneLayer, unpackAt : Int = 0, packAt : Int


val onUnpack = new layer.lane.Execute(unpackAt){
val fsmPortId = 0
val fsmCmd = unpacker.arbiter.io.inputs(fsmPortId)
val fsmRsp = unpacker.results(fsmPortId)
fsmCmd.setIdle()
val fsmPort = withRsUnpack generate new Area {
val id = 0
val cmd = unpacker.arbiter.io.inputs(id)
val rsp = unpacker.results(id)
cmd.setIdle()
}

val firstCycle = RegNext(False) setWhen(!layer.lane.isFreezed())

Expand All @@ -173,7 +180,6 @@ class FpuUnpackerPlugin(val layer : LaneLayer, unpackAt : Int = 0, packAt : Int
mantissaWidth = Riscv.fpuMantissaWidth
))

val unpackerSel = isValid && up(rfa.ENABLE) && rfa.is(FloatRegFile, rfa.RFID) && !up(TRAP) //A bit pessimistic, as not all float instruction will need unpacking

val f32 = new Area {
val mantissa = input(0, 23 bits).asUInt
Expand Down Expand Up @@ -222,28 +228,29 @@ class FpuUnpackerPlugin(val layer : LaneLayer, unpackAt : Int = 0, packAt : Int
2 -> (manZero ? FloatMode.INF | FloatMode.NAN)
)
apply(RS) := RS_PRE_NORM
val normalizer = new Area {
val normalizer = withRsUnpack generate new Area {
val unpackerSel = isValid && up(rfa.ENABLE) && rfa.is(FloatRegFile, rfa.RFID) && !up(TRAP) //A bit pessimistic, as not all float instructions will need unpacking
val valid = unpackerSel && IS_SUBNORMAL
val validReg = RegNext(unpackerSel && IS_SUBNORMAL ) clearWhen(!layer.lane.isFreezed()) init(False)
val asked = RegInit(False) setWhen (fsmRequesters(inputId) && !fsmRequesters.dropLow(inputId + 1).orR || isCancel) clearWhen (clear)
val served = RegInit(False) setWhen (fsmRsp.valid && fsmServed.dropLow(inputId + 1).andR || isCancel) clearWhen (clear)
val served = RegInit(False) setWhen (fsmPort.rsp.valid && fsmServed.dropLow(inputId + 1).andR || isCancel) clearWhen (clear)
fsmRequesters(inputId) := valid && !asked
fsmServed(inputId) := !valid || served

val exponent = Reg(RS.exponent)
val mantissa = Reg(RS.mantissa)

when(fsmRequesters(inputId)) {
fsmCmd.valid := True
fsmCmd.data := RS_PRE_NORM.mantissa.raw << widthOf(fsmCmd.data) - widthOf(RS_PRE_NORM.mantissa.raw)
fsmPort.cmd.valid := True
fsmPort.cmd.data := RS_PRE_NORM.mantissa.raw << widthOf(fsmPort.cmd.data) - widthOf(RS_PRE_NORM.mantissa.raw)
}
when(asked) {
RS.exponent := exponent
RS.mantissa := mantissa
}
when(!served) {
exponent := recodedExpSub - fsmRsp.shift.intoSInt
mantissa.raw := fsmRsp.data >> widthOf(fsmCmd.data) - widthOf(RS_PRE_NORM.mantissa.raw)
exponent := recodedExpSub - fsmPort.rsp.shift.intoSInt
mantissa.raw := fsmPort.rsp.data >> widthOf(fsmPort.cmd.data) - widthOf(RS_PRE_NORM.mantissa.raw)
}
val freezeIt = validReg && !served || firstCycle && unpackerSel && expZero //Maybe a bit hard on timings
layer.lane.freezeWhen(freezeIt)
Expand All @@ -259,7 +266,7 @@ class FpuUnpackerPlugin(val layer : LaneLayer, unpackAt : Int = 0, packAt : Int
}
}

val unpackDone = !onUnpack.rs.map(_.normalizer.freezeIt).toList.orR
val unpackDone = withRsUnpack.mux(!onUnpack.rs.map(_.normalizer.freezeIt).toList.orR, True)


val onCvt = new layer.lane.Execute(unpackAt){ //TODO fmax
Expand All @@ -269,7 +276,7 @@ class FpuUnpackerPlugin(val layer : LaneLayer, unpackAt : Int = 0, packAt : Int
case 64 => rs1(31 downto 0) === 0 && (RsUnsignedPlugin.IS_W || rs1(63 downto 32) === 0)
}

val fsmPortId = 1
val fsmPortId = withRsUnpack.toInt
val fsmCmd = unpacker.arbiter.io.inputs(fsmPortId)
val fsmRsp = unpacker.results(fsmPortId)
val clear = isReady
Expand Down
8 changes: 6 additions & 2 deletions src/main/scala/vexiiriscv/soc/litex/Soc.scala
Original file line number Diff line number Diff line change
Expand Up @@ -656,8 +656,12 @@ mpg123 -a bluealsa mp3/01-long_distance_calling-metulsky_curse_revisited.mp3
--sbc-quality=low
perf stat -e branch-misses,branches,l1-dcache-loads,l1-dcache-load-misses,l1-icache-loads,l1-icache-load-misses,cycles,instructions ls
r12,r13,r1a,r1b,stalled-cycles-frontend,stalled-cycles-backend,cycles,instructions,branch-misses,branches
perf stat -p $! --timeout 1000 -e branch-misses,branches,l1-dcache-loads,l1-dcache-load-misses,l1-icache-loads,l1-icache-load-misses,cycles,instructions
perf stat -p $! --timeout 1000 -e r12,r13,r1a,r1b,stalled-cycles-frontend,stalled-cycles-backend,cycles,instructions,branch-misses,branches
perf stat -p $! --timeout 1000 -e r12,r13,r1a,r1b,cycles,instructions,branch-misses,branches
perf stat -p $! --timeout 1000 -e stalled-cycles-frontend,stalled-cycles-backend,cycles,instructions
r8000000000000000,r8000000000000001,r8000000000000004
~/c/libsdl2/libsdl2-2.30.2+dfsg/debian/build-tests# make -j1 check "testsuiteflags=-j1 --verbose" verbose=1 v=1 &> testlog.txt
Expand Down
49 changes: 37 additions & 12 deletions src/test/scala/vexiiriscv/scratchpad/Synt.scala
Original file line number Diff line number Diff line change
Expand Up @@ -661,15 +661,24 @@ object IntegrationSynthBench extends App{
lsuL1Sets = 64
lsuL1Ways = 4

withMmu = true
privParam.withSupervisor = true;
privParam.withUser = true;

lsuL1RefillCount = 2
lsuL1WritebackCount = 2
lsuStoreBufferSlots = 2
lsuStoreBufferOps = 32
lsuL1Coherency = true

lsuHardwarePrefetch = "rpt"
lsuSoftwarePrefetch = true

withRvf = true
withRvd = true
fpuFmaFullAccuracy = false


}

def debianTweeked(name : String)(body : ParamSimple => Unit) : Unit = {
Expand Down Expand Up @@ -698,25 +707,26 @@ object IntegrationSynthBench extends App{
// param.allowBypassFrom = 100
// }
//
// debianTweeked("vexii_debian_nofpu") { param =>
// param.withRvf = false
// param.withRvd = false
// }
debianTweeked("vexii_debian_nofpu") { param =>
param.withRvf = false
param.withRvd = false
}
//
//
// debianTweeked("vexii_debian_nobp") { param =>
// param.allowBypassFrom = 100
// }
//
// debianTweeked("vexii_debian"){param =>
//
// }

debianTweeked("vexii_debian_full") { param =>
param.lsuHardwarePrefetch = "rpt"
param.lsuSoftwarePrefetch = true
debianTweeked("vexii_debian"){param =>

}

debianTweeked("vexii_debian_ignoreSubnormal") { param =>
param.fpuIgnoreSubnormal = true
}



//
// debianTweeked("vexii_debian_no_fpu_dual_issue") { param =>
// param.withRvf = false
Expand Down Expand Up @@ -764,7 +774,7 @@ object IntegrationSynthBench extends App{


val targets = ArrayBuffer[Target]()
targets ++= XilinxStdTargets(withFMax = true, withArea = false)
targets ++= XilinxStdTargets(withFMax = false, withArea = true)
// targets ++= AlteraStdTargets()
// targets ++= EfinixStdTargets(withFMax = true, withArea = true)

Expand All @@ -773,6 +783,21 @@ object IntegrationSynthBench extends App{

/*
vexii_debian ->
Artix 7 -> 71 Mhz 11397 LUT 7959 FF
Artix 7 -> 150 Mhz 12586 LUT 7982 FF
vexii_debian_ignoreSubnormal ->
Artix 7 -> 82 Mhz 10672 LUT 7706 FF
Artix 7 -> 157 Mhz 11681 LUT 7723 FF
vexii_debian_nofpu ->
Artix 7 -> 90 Mhz 6391 LUT 4975 FF
Artix 7 -> 164 Mhz 7137 LUT 5070 FF
vexii_debian_full ->
Artix 7 -> 158 Mhz 12352 LUT 7986 FF
Artix 7 -> 163 Mhz 12015 LUT 7944 FF
vexii_debian_nofpu_nobp ->
Expand Down

0 comments on commit ed777de

Please sign in to comment.